Commit 0fccd232 authored by Rayyyyy

First add
import csv
import logging
import os
from typing import List
import numpy as np
from .. import CrossEncoder
from ... import InputExample
from sklearn.metrics import f1_score
logger = logging.getLogger(__name__)
class CEF1Evaluator:
"""
CrossEncoder F1 score based evaluator for binary and multiclass tasks.
The task type (binary or multiclass) is determined from the labels array. For
binary tasks the returned metric is the binary F1 score. For multiclass tasks
the returned metric is the macro F1 score.
:param sentence_pairs: A list of sentence pairs, where each pair is a list of two strings.
:type sentence_pairs: list[list[str]]
:param labels: A list of integer labels corresponding to each sentence pair.
:type labels: list[int]
:param batch_size: Batch size for prediction. Defaults to 32.
:type batch_size: int
:param show_progress_bar: Show tqdm progress bar.
:type show_progress_bar: bool
:param name: An optional name for the CSV file with stored results. Defaults to an empty string.
:type name: str, optional
:param write_csv: Flag to determine if the data should be saved to a CSV file. Defaults to True.
:type write_csv: bool, optional
"""
def __init__(
self,
sentence_pairs: List[List[str]],
labels: List[int],
*,
batch_size: int = 32,
show_progress_bar: bool = False,
name: str = "",
write_csv: bool = True,
):
self.sentence_pairs = sentence_pairs
self.labels = labels
self.batch_size = batch_size
self.show_progress_bar = show_progress_bar
self.name = name
self.write_csv = write_csv
n_unique = np.unique(labels).size
if n_unique == 2:
self.f1_callables = [
("Binary F1 score", lambda x, y: f1_score(x, y, average="binary")),
]
elif n_unique > 2:
self.f1_callables = [
("Macro F1 score", lambda x, y: f1_score(x, y, average="macro")),
("Micro F1 score", lambda x, y: f1_score(x, y, average="micro")),
("Weighted F1 score", lambda x, y: f1_score(x, y, average="weighted")),
]
else:
raise ValueError(
"Got only one distinct label. Please make sure there are at least two labels in the `labels` array."
)
self.csv_file = "CEF1Evaluator" + (f"_{name}" if name else "") + "_results.csv"
self.csv_headers = ["epoch", "steps"] + [metric_name for metric_name, _ in self.f1_callables]
@classmethod
def from_input_examples(cls, examples: List[InputExample], **kwargs):
sentence_pairs = []
labels = []
for example in examples:
sentence_pairs.append(example.texts)
labels.append(example.label)
return cls(sentence_pairs, labels, **kwargs)
def __call__(
self,
model: CrossEncoder,
output_path: str = None,
epoch: int = -1,
steps: int = -1,
) -> float:
if epoch != -1:
if steps == -1:
out_txt = f"after epoch {epoch}:"
else:
out_txt = f"in epoch {epoch} after {steps} steps:"
else:
out_txt = ":"
logger.info(f"CEF1Evaluator: Evaluating the model on {self.name} dataset {out_txt}")
pred_scores = model.predict(
self.sentence_pairs,
batch_size=self.batch_size,
show_progress_bar=self.show_progress_bar,
convert_to_numpy=True,
)
pred_labels = np.argmax(pred_scores, axis=1)
assert len(pred_labels) == len(self.labels)
save_f1 = []
for f1_name, f1_fn in self.f1_callables:
f1_val = f1_fn(pred_labels, self.labels)
save_f1.append(f1_val)
logger.info(f"{f1_name:20s}: {f1_val * 100:.2f}")
if output_path is not None and self.write_csv:
csv_path = os.path.join(output_path, self.csv_file)
output_file_exists = os.path.isfile(csv_path)
mode = "a" if output_file_exists else "w"
with open(csv_path, mode=mode, encoding="utf-8") as f:
writer = csv.writer(f)
if not output_file_exists:
writer.writerow(self.csv_headers)
writer.writerow([epoch, steps, *save_f1])
return save_f1[0]
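# Usage sketch (illustrative only): how CEF1Evaluator could be used with a CrossEncoder on a
# tiny binary task. The base checkpoint name and the toy sentence pairs below are assumptions,
# not taken from this commit.
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CEF1Evaluator

model = CrossEncoder("distilroberta-base", num_labels=2)  # assumed base model with 2 output logits
sentence_pairs = [
    ["A man is eating food.", "A man is eating a meal."],
    ["A man is eating food.", "The sky is blue."],
]
labels = [1, 0]  # two distinct labels -> the binary F1 score is reported
evaluator = CEF1Evaluator(sentence_pairs, labels, name="toy-dev")
print(evaluator(model, output_path=None))  # a CSV is only written when output_path is given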
import logging
import numpy as np
import os
import csv
from typing import Optional
from sklearn.metrics import ndcg_score
logger = logging.getLogger(__name__)
class CERerankingEvaluator:
"""
This class evaluates a CrossEncoder model for the task of re-ranking.
Given a query and a list of documents, it computes the score [query, doc_i] for all possible
documents and sorts them in decreasing order. Then, MRR@10 and NDCG@10 are computed to measure the quality of the ranking.
:param samples: Must be a list and each element is of the form: {'query': '', 'positive': [], 'negative': []}. Query is the search query,
positive is a list of positive (relevant) documents, negative is a list of negative (irrelevant) documents.
"""
def __init__(
self, samples, at_k: int = 10, name: str = "", write_csv: bool = True, mrr_at_k: Optional[int] = None
):
self.samples = samples
self.name = name
if mrr_at_k is not None:
logger.warning(f"The `mrr_at_k` parameter has been deprecated; please use `at_k={mrr_at_k}` instead.")
self.at_k = mrr_at_k
else:
self.at_k = at_k
if isinstance(self.samples, dict):
self.samples = list(self.samples.values())
self.csv_file = "CERerankingEvaluator" + ("_" + name if name else "") + f"_results_@{self.at_k}.csv"
self.csv_headers = [
"epoch",
"steps",
"MRR@{}".format(self.at_k),
"NDCG@{}".format(self.at_k),
]
self.write_csv = write_csv
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
if epoch != -1:
if steps == -1:
out_txt = " after epoch {}:".format(epoch)
else:
out_txt = " in epoch {} after {} steps:".format(epoch, steps)
else:
out_txt = ":"
logger.info("CERerankingEvaluator: Evaluating the model on " + self.name + " dataset" + out_txt)
all_mrr_scores = []
all_ndcg_scores = []
num_queries = 0
num_positives = []
num_negatives = []
for instance in self.samples:
query = instance["query"]
positive = list(instance["positive"])
negative = list(instance["negative"])
docs = positive + negative
is_relevant = [1] * len(positive) + [0] * len(negative)
if len(positive) == 0 or len(negative) == 0:
continue
num_queries += 1
num_positives.append(len(positive))
num_negatives.append(len(negative))
model_input = [[query, doc] for doc in docs]
pred_scores = model.predict(model_input, convert_to_numpy=True, show_progress_bar=False)
pred_scores_argsort = np.argsort(-pred_scores) # Sort in decreasing order
mrr_score = 0
for rank, index in enumerate(pred_scores_argsort[0 : self.at_k]):
if is_relevant[index]:
mrr_score = 1 / (rank + 1)
break
all_mrr_scores.append(mrr_score)
all_ndcg_scores.append(ndcg_score([is_relevant], [pred_scores], k=self.at_k))
mean_mrr = np.mean(all_mrr_scores)
mean_ndcg = np.mean(all_ndcg_scores)
logger.info(
"Queries: {} \t Positives: Min {:.1f}, Mean {:.1f}, Max {:.1f} \t Negatives: Min {:.1f}, Mean {:.1f}, Max {:.1f}".format(
num_queries,
np.min(num_positives),
np.mean(num_positives),
np.max(num_positives),
np.min(num_negatives),
np.mean(num_negatives),
np.max(num_negatives),
)
)
logger.info("MRR@{}: {:.2f}".format(self.at_k, mean_mrr * 100))
logger.info("NDCG@{}: {:.2f}".format(self.at_k, mean_ndcg * 100))
if output_path is not None and self.write_csv:
csv_path = os.path.join(output_path, self.csv_file)
output_file_exists = os.path.isfile(csv_path)
with open(csv_path, mode="a" if output_file_exists else "w", encoding="utf-8") as f:
writer = csv.writer(f)
if not output_file_exists:
writer.writerow(self.csv_headers)
writer.writerow([epoch, steps, mean_mrr, mean_ndcg])
return mean_mrr
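# Usage sketch (illustrative only): the sample format expected by CERerankingEvaluator.
# The checkpoint name, query and documents are assumptions chosen for illustration.
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CERerankingEvaluator

samples = [
    {
        "query": "how do I bake bread",
        "positive": ["Mix flour, water, salt and yeast, then bake at 220 degrees."],
        "negative": ["Paris is the capital of France.", "Python is a programming language."],
    },
]
evaluator = CERerankingEvaluator(samples, at_k=10, name="toy-rerank")
model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")  # assumed reranking checkpoint
print(evaluator(model))  # returns mean MRR@10; NDCG@10 is logged alongside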
import logging
import os
import csv
from typing import List
from ... import InputExample
import numpy as np
logger = logging.getLogger(__name__)
class CESoftmaxAccuracyEvaluator:
"""
This evaluator can be used with the CrossEncoder class.
It is designed for CrossEncoders with 2 or more outputs. It measures the
accuracy of the predicted class vs. the gold labels.
"""
def __init__(self, sentence_pairs: List[List[str]], labels: List[int], name: str = "", write_csv: bool = True):
self.sentence_pairs = sentence_pairs
self.labels = labels
self.name = name
self.csv_file = "CESoftmaxAccuracyEvaluator" + ("_" + name if name else "") + "_results.csv"
self.csv_headers = ["epoch", "steps", "Accuracy"]
self.write_csv = write_csv
@classmethod
def from_input_examples(cls, examples: List[InputExample], **kwargs):
sentence_pairs = []
labels = []
for example in examples:
sentence_pairs.append(example.texts)
labels.append(example.label)
return cls(sentence_pairs, labels, **kwargs)
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
if epoch != -1:
if steps == -1:
out_txt = " after epoch {}:".format(epoch)
else:
out_txt = " in epoch {} after {} steps:".format(epoch, steps)
else:
out_txt = ":"
logger.info("CESoftmaxAccuracyEvaluator: Evaluating the model on " + self.name + " dataset" + out_txt)
pred_scores = model.predict(self.sentence_pairs, convert_to_numpy=True, show_progress_bar=False)
pred_labels = np.argmax(pred_scores, axis=1)
assert len(pred_labels) == len(self.labels)
acc = np.sum(pred_labels == self.labels) / len(self.labels)
logger.info("Accuracy: {:.2f}".format(acc * 100))
if output_path is not None and self.write_csv:
csv_path = os.path.join(output_path, self.csv_file)
output_file_exists = os.path.isfile(csv_path)
with open(csv_path, mode="a" if output_file_exists else "w", encoding="utf-8") as f:
writer = csv.writer(f)
if not output_file_exists:
writer.writerow(self.csv_headers)
writer.writerow([epoch, steps, acc])
return acc
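# Usage sketch (illustrative only): CESoftmaxAccuracyEvaluator built via from_input_examples.
# The labels and the base checkpoint are assumptions chosen for illustration.
from sentence_transformers import InputExample
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CESoftmaxAccuracyEvaluator

dev_examples = [
    InputExample(texts=["A soccer game.", "Some men are playing a sport."], label=1),
    InputExample(texts=["A soccer game.", "Nobody is doing anything."], label=0),
]
evaluator = CESoftmaxAccuracyEvaluator.from_input_examples(dev_examples, name="toy-softmax")
model = CrossEncoder("distilroberta-base", num_labels=2)  # assumed base model with 2 output logits
print(evaluator(model))  # accuracy of argmax(predictions) vs. the gold labels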
from .CEBinaryAccuracyEvaluator import CEBinaryAccuracyEvaluator
from .CEBinaryClassificationEvaluator import CEBinaryClassificationEvaluator
from .CEF1Evaluator import CEF1Evaluator
from .CECorrelationEvaluator import CECorrelationEvaluator
from .CESoftmaxAccuracyEvaluator import CESoftmaxAccuracyEvaluator
from .CERerankingEvaluator import CERerankingEvaluator
__all__ = [
"CEBinaryAccuracyEvaluator",
"CEBinaryClassificationEvaluator",
"CECorrelationEvaluator",
"CEF1Evaluator",
"CESoftmaxAccuracyEvaluator",
"CERerankingEvaluator",
]
from torch.utils.data import Dataset
from typing import List
from ..readers.InputExample import InputExample
import numpy as np
from transformers.utils.import_utils import is_nltk_available, NLTK_IMPORT_ERROR
class DenoisingAutoEncoderDataset(Dataset):
"""
The DenoisingAutoEncoderDataset returns InputExamples in the format: texts=[noise_fn(sentence), sentence]
It is used in combination with the DenoisingAutoEncoderLoss: Here, a decoder tries to re-construct the
sentence without noise.
:param sentences: A list of sentences
:param noise_fn: A noise function: Given a string, it returns a string with noise, e.g. deleted words
"""
def __init__(self, sentences: List[str], noise_fn=lambda s: DenoisingAutoEncoderDataset.delete(s)):
if not is_nltk_available():
raise ImportError(NLTK_IMPORT_ERROR.format(self.__class__.__name__))
self.sentences = sentences
self.noise_fn = noise_fn
def __getitem__(self, item):
sent = self.sentences[item]
return InputExample(texts=[self.noise_fn(sent), sent])
def __len__(self):
return len(self.sentences)
# Deletion noise.
@staticmethod
def delete(text, del_ratio=0.6):
from nltk import word_tokenize, TreebankWordDetokenizer
words = word_tokenize(text)
n = len(words)
if n == 0:
return text
keep_or_not = np.random.rand(n) > del_ratio
if sum(keep_or_not) == 0:
keep_or_not[np.random.choice(n)] = True # guarantee that at least one word remains
words_processed = TreebankWordDetokenizer().detokenize(np.array(words)[keep_or_not])
return words_processed
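# Usage sketch (illustrative only): wrapping raw sentences for denoising (TSDAE-style) training.
# Requires nltk with its tokenizer data installed; the sentences are assumptions for illustration.
from sentence_transformers.datasets import DenoisingAutoEncoderDataset

sentences = [
    "The quick brown fox jumps over the lazy dog.",
    "Sentence embeddings map text to dense vectors.",
]
dataset = DenoisingAutoEncoderDataset(sentences)
example = dataset[0]
print(example.texts)  # [noisy sentence (words randomly deleted), original sentence]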
import random
import math
class NoDuplicatesDataLoader:
def __init__(self, train_examples, batch_size):
"""
A special data loader to be used with MultipleNegativesRankingLoss.
The data loader ensures that there are no duplicate sentences within the same batch
"""
self.batch_size = batch_size
self.data_pointer = 0
self.collate_fn = None
self.train_examples = train_examples
random.shuffle(self.train_examples)
def __iter__(self):
for _ in range(self.__len__()):
batch = []
texts_in_batch = set()
while len(batch) < self.batch_size:
example = self.train_examples[self.data_pointer]
valid_example = True
for text in example.texts:
if text.strip().lower() in texts_in_batch:
valid_example = False
break
if valid_example:
batch.append(example)
for text in example.texts:
texts_in_batch.add(text.strip().lower())
self.data_pointer += 1
if self.data_pointer >= len(self.train_examples):
self.data_pointer = 0
random.shuffle(self.train_examples)
yield self.collate_fn(batch) if self.collate_fn is not None else batch
def __len__(self):
return math.floor(len(self.train_examples) / self.batch_size)
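# Usage sketch (illustrative only): NoDuplicatesDataLoader with MultipleNegativesRankingLoss-style
# pairs. The texts are assumptions chosen for illustration.
from sentence_transformers import InputExample
from sentence_transformers.datasets import NoDuplicatesDataLoader

train_examples = [
    InputExample(texts=["anchor one", "positive one"]),
    InputExample(texts=["anchor two", "positive two"]),
    InputExample(texts=["anchor one", "another positive"]),  # shares a text with the first example
]
loader = NoDuplicatesDataLoader(train_examples, batch_size=2)
for batch in loader:
    # no two examples inside a batch share a (stripped, lower-cased) text
    print([example.texts for example in batch])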
from torch.utils.data import Dataset
import logging
import gzip
from .. import SentenceTransformer
from ..readers import InputExample
from typing import List
import random
logger = logging.getLogger(__name__)
class ParallelSentencesDataset(Dataset):
"""
This dataset reader can be used to read in parallel sentences, i.e., it reads a file with tab-separated lines that contain the same
sentence in different languages. For example, the file can look like this (EN\tDE\tES):
hello world hallo welt hola mundo
second sentence zweiter satz segunda oración
The sentence in the first column will be mapped to a sentence embedding using the given embedder, for example,
a mono-lingual sentence embedding model for English. The sentences in the other languages will also be
mapped to this English sentence embedding.
When getting a sample from the dataset, we get one sentence with the corresponding sentence embedding for this sentence.
teacher_model can be any class that implements an encode function. The encode function gets a list of sentences and
returns a list of sentence embeddings.
"""
def __init__(
self,
student_model: SentenceTransformer,
teacher_model: SentenceTransformer,
batch_size: int = 8,
use_embedding_cache: bool = True,
):
"""
Parallel sentences dataset reader to train student model given a teacher model
:param student_model: Student sentence embedding model that should be trained
:param teacher_model: Teacher model, that provides the sentence embeddings for the first column in the dataset file
"""
self.student_model = student_model
self.teacher_model = teacher_model
self.datasets = []
self.datasets_iterator = []
self.datasets_tokenized = []
self.dataset_indices = []
self.copy_dataset_indices = []
self.cache = []
self.batch_size = batch_size
self.use_embedding_cache = use_embedding_cache
self.embedding_cache = {}
self.num_sentences = 0
def load_data(self, filepath: str, weight: int = 100, max_sentences: int = None, max_sentence_length: int = 128):
"""
Reads in a tab-separated .txt/.csv/.tsv or .gz file. The different columns contain the different translations of the sentence in the first column
:param filepath: Filepath to the file
:param weight: If more than one dataset is loaded with load_data: With which frequency should data be sampled from this dataset?
:param max_sentences: Max number of lines to be read from filepath
:param max_sentence_length: Skip the example if one of the sentences has more characters than max_sentence_length
:return:
"""
logger.info("Load " + filepath)
parallel_sentences = []
with gzip.open(filepath, "rt", encoding="utf8") if filepath.endswith(".gz") else open(
filepath, encoding="utf8"
) as fIn:
count = 0
for line in fIn:
sentences = line.strip().split("\t")
if (
max_sentence_length is not None
and max_sentence_length > 0
and max([len(sent) for sent in sentences]) > max_sentence_length
):
continue
parallel_sentences.append(sentences)
count += 1
if max_sentences is not None and max_sentences > 0 and count >= max_sentences:
break
self.add_dataset(
parallel_sentences, weight=weight, max_sentences=max_sentences, max_sentence_length=max_sentence_length
)
def add_dataset(
self,
parallel_sentences: List[List[str]],
weight: int = 100,
max_sentences: int = None,
max_sentence_length: int = 128,
):
sentences_map = {}
for sentences in parallel_sentences:
if (
max_sentence_length is not None
and max_sentence_length > 0
and max([len(sent) for sent in sentences]) > max_sentence_length
):
continue
source_sentence = sentences[0]
if source_sentence not in sentences_map:
sentences_map[source_sentence] = set()
for sent in sentences:
sentences_map[source_sentence].add(sent)
if max_sentences is not None and max_sentences > 0 and len(sentences_map) >= max_sentences:
break
if len(sentences_map) == 0:
return
self.num_sentences += sum([len(sentences_map[sent]) for sent in sentences_map])
dataset_id = len(self.datasets)
self.datasets.append(list(sentences_map.items()))
self.datasets_iterator.append(0)
self.dataset_indices.extend([dataset_id] * weight)
def generate_data(self):
source_sentences_list = []
target_sentences_list = []
for data_idx in self.dataset_indices:
src_sentence, trg_sentences = self.next_entry(data_idx)
source_sentences_list.append(src_sentence)
target_sentences_list.append(trg_sentences)
# Generate embeddings
src_embeddings = self.get_embeddings(source_sentences_list)
for src_embedding, trg_sentences in zip(src_embeddings, target_sentences_list):
for trg_sentence in trg_sentences:
self.cache.append(InputExample(texts=[trg_sentence], label=src_embedding))
random.shuffle(self.cache)
def next_entry(self, data_idx):
source, target_sentences = self.datasets[data_idx][self.datasets_iterator[data_idx]]
self.datasets_iterator[data_idx] += 1
if self.datasets_iterator[data_idx] >= len(self.datasets[data_idx]): # Restart iterator
self.datasets_iterator[data_idx] = 0
random.shuffle(self.datasets[data_idx])
return source, target_sentences
def get_embeddings(self, sentences):
if not self.use_embedding_cache:
return self.teacher_model.encode(
sentences, batch_size=self.batch_size, show_progress_bar=False, convert_to_numpy=True
)
# Use caching
new_sentences = []
for sent in sentences:
if sent not in self.embedding_cache:
new_sentences.append(sent)
if len(new_sentences) > 0:
new_embeddings = self.teacher_model.encode(
new_sentences, batch_size=self.batch_size, show_progress_bar=False, convert_to_numpy=True
)
for sent, embedding in zip(new_sentences, new_embeddings):
self.embedding_cache[sent] = embedding
return [self.embedding_cache[sent] for sent in sentences]
def __len__(self):
return self.num_sentences
def __getitem__(self, idx):
if len(self.cache) == 0:
self.generate_data()
return self.cache.pop()
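# Usage sketch (illustrative only): multilingual knowledge-distillation data loading.
# The model names and the file path are assumptions; the file is expected to contain
# tab-separated parallel sentences.
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer
from sentence_transformers.datasets import ParallelSentencesDataset

teacher = SentenceTransformer("paraphrase-distilroberta-base-v2")  # assumed teacher checkpoint
student = SentenceTransformer("xlm-roberta-base")                  # assumed student base model
data = ParallelSentencesDataset(student_model=student, teacher_model=teacher, batch_size=32)
data.load_data("parallel-sentences/en-de.tsv.gz")  # hypothetical path to a .tsv/.tsv.gz file
loader = DataLoader(data, shuffle=True, batch_size=32)
# during training, model.fit(...) replaces loader.collate_fn with the model's smart batching collate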
""" """
from torch.utils.data import IterableDataset
import numpy as np
from typing import List
from ..readers import InputExample
import logging
logger = logging.getLogger(__name__)
class SentenceLabelDataset(IterableDataset):
"""
This dataset can be used for some specific Triplet Losses like BATCH_HARD_TRIPLET_LOSS which requires
multiple examples with the same label in a batch.
It draws n consecutive, random and unique samples from one label at a time. This is repeated for each label.
Labels with fewer than n unique samples are ignored.
This also applies to drawing without replacement: once fewer than n samples remain for a label, it is skipped.
This *DOES NOT* check whether there are more labels than the batch size, or whether the batch size
is divisible by the number of samples drawn per label.
"""
def __init__(self, examples: List[InputExample], samples_per_label: int = 2, with_replacement: bool = False):
"""
Creates a LabelSampler for a SentenceLabelDataset.
:param examples:
a list with InputExamples
:param samples_per_label:
the number of consecutive, random and unique samples drawn per label. Batch size should be a multiple of samples_per_label
:param with_replacement:
if this is False, then each sample is drawn at most once (depending on the total number of samples per label).
if this is True, then one sample can be drawn in multiple draws, but still not multiple times within the same
drawing.
"""
super().__init__()
self.samples_per_label = samples_per_label
# Group examples by label
label2ex = {}
for example in examples:
if example.label not in label2ex:
label2ex[example.label] = []
label2ex[example.label].append(example)
# Include only labels with at least 2 examples
self.grouped_inputs = []
self.groups_right_border = []
num_labels = 0
for label, label_examples in label2ex.items():
if len(label_examples) >= self.samples_per_label:
self.grouped_inputs.extend(label_examples)
self.groups_right_border.append(
len(self.grouped_inputs)
) # At which position does this label group / bucket end?
num_labels += 1
self.label_range = np.arange(num_labels)
self.with_replacement = with_replacement
np.random.shuffle(self.label_range)
logger.info(
"SentenceLabelDataset: {} examples, from which {} examples could be used (those labels appeared at least {} times). {} different labels found.".format(
len(examples), len(self.grouped_inputs), self.samples_per_label, num_labels
)
)
def __iter__(self):
label_idx = 0
count = 0
already_seen = {}
while count < len(self.grouped_inputs):
label = self.label_range[label_idx]
if label not in already_seen:
already_seen[label] = set()
left_border = 0 if label == 0 else self.groups_right_border[label - 1]
right_border = self.groups_right_border[label]
if self.with_replacement:
selection = np.arange(left_border, right_border)
else:
selection = [i for i in np.arange(left_border, right_border) if i not in already_seen[label]]
if len(selection) >= self.samples_per_label:
for element_idx in np.random.choice(selection, self.samples_per_label, replace=False):
count += 1
already_seen[label].add(element_idx)
yield self.grouped_inputs[element_idx]
label_idx += 1
if label_idx >= len(self.label_range):
label_idx = 0
already_seen = {}
np.random.shuffle(self.label_range)
def __len__(self):
return len(self.grouped_inputs)
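# Usage sketch (illustrative only): grouping labeled InputExamples for batch-hard triplet style
# training. The texts and labels are assumptions chosen for illustration.
from torch.utils.data import DataLoader
from sentence_transformers import InputExample
from sentence_transformers.datasets import SentenceLabelDataset

examples = [
    InputExample(texts=["first sentence about cats"], label=0),
    InputExample(texts=["second sentence about cats"], label=0),
    InputExample(texts=["first sentence about dogs"], label=1),
    InputExample(texts=["second sentence about dogs"], label=1),
]
dataset = SentenceLabelDataset(examples, samples_per_label=2)
loader = DataLoader(dataset, batch_size=4)  # batch size should be a multiple of samples_per_label
# during training, model.fit(...) replaces loader.collate_fn with the model's smart batching collate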
from torch.utils.data import Dataset
from typing import List
from .. import SentenceTransformer
from ..readers.InputExample import InputExample
class SentencesDataset(Dataset):
"""
DEPRECATED: This class is no longer used. Instead of wrapping your List of InputExamples in a SentencesDataset
and then passing it to the DataLoader, you can pass the list of InputExamples directly to the DataLoader.
"""
def __init__(self, examples: List[InputExample], model: SentenceTransformer):
self.examples = examples
def __getitem__(self, item):
return self.examples[item]
def __len__(self):
return len(self.examples)
from .DenoisingAutoEncoderDataset import DenoisingAutoEncoderDataset
from .NoDuplicatesDataLoader import NoDuplicatesDataLoader
from .ParallelSentencesDataset import ParallelSentencesDataset
from .SentencesDataset import SentencesDataset
from .SentenceLabelDataset import SentenceLabelDataset
__all__ = [
"DenoisingAutoEncoderDataset",
"NoDuplicatesDataLoader",
"ParallelSentencesDataset",
"SentencesDataset",
"SentenceLabelDataset",
]
from . import SentenceEvaluator
import logging
import os
import csv
from sklearn.metrics.pairwise import paired_cosine_distances, paired_euclidean_distances, paired_manhattan_distances
from sklearn.metrics import average_precision_score
import numpy as np
from typing import List
from ..readers import InputExample
logger = logging.getLogger(__name__)
class BinaryClassificationEvaluator(SentenceEvaluator):
"""
Evaluate a model based on the similarity of the embeddings by calculating the accuracy of identifying similar and
dissimilar sentences.
The metrics are cosine similarity, dot product, and the Euclidean and Manhattan distances.
The returned score is the highest Average Precision (AP) across these metrics.
The results are written in a CSV. If a CSV already exists, then values are appended.
The labels need to be 0 for dissimilar pairs and 1 for similar pairs.
:param sentences1: The first column of sentences
:param sentences2: The second column of sentences
:param labels: labels[i] is the label for the pair (sentences1[i], sentences2[i]). Must be 0 or 1
:param name: Name for the output
:param batch_size: Batch size used to compute embeddings
:param show_progress_bar: If true, prints a progress bar
:param write_csv: Write results to a CSV file
"""
def __init__(
self,
sentences1: List[str],
sentences2: List[str],
labels: List[int],
name: str = "",
batch_size: int = 32,
show_progress_bar: bool = False,
write_csv: bool = True,
):
self.sentences1 = sentences1
self.sentences2 = sentences2
self.labels = labels
assert len(self.sentences1) == len(self.sentences2)
assert len(self.sentences1) == len(self.labels)
for label in labels:
assert label == 0 or label == 1
self.write_csv = write_csv
self.name = name
self.batch_size = batch_size
if show_progress_bar is None:
show_progress_bar = (
logger.getEffectiveLevel() == logging.INFO or logger.getEffectiveLevel() == logging.DEBUG
)
self.show_progress_bar = show_progress_bar
self.csv_file = "binary_classification_evaluation" + ("_" + name if name else "") + "_results.csv"
self.csv_headers = [
"epoch",
"steps",
"cossim_accuracy",
"cossim_accuracy_threshold",
"cossim_f1",
"cossim_precision",
"cossim_recall",
"cossim_f1_threshold",
"cossim_ap",
"manhattan_accuracy",
"manhattan_accuracy_threshold",
"manhattan_f1",
"manhattan_precision",
"manhattan_recall",
"manhattan_f1_threshold",
"manhattan_ap",
"euclidean_accuracy",
"euclidean_accuracy_threshold",
"euclidean_f1",
"euclidean_precision",
"euclidean_recall",
"euclidean_f1_threshold",
"euclidean_ap",
"dot_accuracy",
"dot_accuracy_threshold",
"dot_f1",
"dot_precision",
"dot_recall",
"dot_f1_threshold",
"dot_ap",
]
@classmethod
def from_input_examples(cls, examples: List[InputExample], **kwargs):
sentences1 = []
sentences2 = []
scores = []
for example in examples:
sentences1.append(example.texts[0])
sentences2.append(example.texts[1])
scores.append(example.label)
return cls(sentences1, sentences2, scores, **kwargs)
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
if epoch != -1:
if steps == -1:
out_txt = f" after epoch {epoch}:"
else:
out_txt = f" in epoch {epoch} after {steps} steps:"
else:
out_txt = ":"
logger.info("Binary Accuracy Evaluation of the model on " + self.name + " dataset" + out_txt)
scores = self.compute_metrices(model)
# Main score is the max of Average Precision (AP)
main_score = max(scores[short_name]["ap"] for short_name in scores)
file_output_data = [epoch, steps]
for header_name in self.csv_headers:
if "_" in header_name:
sim_fct, metric = header_name.split("_", maxsplit=1)
file_output_data.append(scores[sim_fct][metric])
if output_path is not None and self.write_csv:
csv_path = os.path.join(output_path, self.csv_file)
if not os.path.isfile(csv_path):
with open(csv_path, newline="", mode="w", encoding="utf-8") as f:
writer = csv.writer(f)
writer.writerow(self.csv_headers)
writer.writerow(file_output_data)
else:
with open(csv_path, newline="", mode="a", encoding="utf-8") as f:
writer = csv.writer(f)
writer.writerow(file_output_data)
return main_score
def compute_metrices(self, model):
try:
# If the sentences are hashable, then we can use a set to avoid embedding the same sentences multiple times
sentences = list(set(self.sentences1 + self.sentences2))
embeddings = model.encode(
sentences, batch_size=self.batch_size, show_progress_bar=self.show_progress_bar, convert_to_numpy=True
)
emb_dict = {sent: emb for sent, emb in zip(sentences, embeddings)}
embeddings1 = [emb_dict[sent] for sent in self.sentences1]
embeddings2 = [emb_dict[sent] for sent in self.sentences2]
except TypeError:
# Otherwise we just embed everything, e.g. if the sentences are images for evaluating a CLIP model
embeddings = model.encode(
self.sentences1 + self.sentences2,
batch_size=self.batch_size,
show_progress_bar=self.show_progress_bar,
convert_to_numpy=True,
)
embeddings1 = embeddings[: len(self.sentences1)]
embeddings2 = embeddings[len(self.sentences1) :]
cosine_scores = 1 - paired_cosine_distances(embeddings1, embeddings2)
manhattan_distances = paired_manhattan_distances(embeddings1, embeddings2)
euclidean_distances = paired_euclidean_distances(embeddings1, embeddings2)
embeddings1_np = np.asarray(embeddings1)
embeddings2_np = np.asarray(embeddings2)
dot_scores = [np.dot(embeddings1_np[i], embeddings2_np[i]) for i in range(len(embeddings1_np))]
labels = np.asarray(self.labels)
output_scores = {}
for short_name, name, scores, reverse in [
["cossim", "Cosine-Similarity", cosine_scores, True],
["manhattan", "Manhattan-Distance", manhattan_distances, False],
["euclidean", "Euclidean-Distance", euclidean_distances, False],
["dot", "Dot-Product", dot_scores, True],
]:
acc, acc_threshold = self.find_best_acc_and_threshold(scores, labels, reverse)
f1, precision, recall, f1_threshold = self.find_best_f1_and_threshold(scores, labels, reverse)
ap = average_precision_score(labels, scores * (1 if reverse else -1))
logger.info(
"Accuracy with {}: {:.2f}\t(Threshold: {:.4f})".format(name, acc * 100, acc_threshold)
)
logger.info("F1 with {}: {:.2f}\t(Threshold: {:.4f})".format(name, f1 * 100, f1_threshold))
logger.info("Precision with {}: {:.2f}".format(name, precision * 100))
logger.info("Recall with {}: {:.2f}".format(name, recall * 100))
logger.info("Average Precision with {}: {:.2f}\n".format(name, ap * 100))
output_scores[short_name] = {
"accuracy": acc,
"accuracy_threshold": acc_threshold,
"f1": f1,
"f1_threshold": f1_threshold,
"precision": precision,
"recall": recall,
"ap": ap,
}
return output_scores
@staticmethod
def find_best_acc_and_threshold(scores, labels, high_score_more_similar: bool):
assert len(scores) == len(labels)
rows = list(zip(scores, labels))
rows = sorted(rows, key=lambda x: x[0], reverse=high_score_more_similar)
max_acc = 0
best_threshold = -1
positive_so_far = 0
remaining_negatives = sum(labels == 0)
for i in range(len(rows) - 1):
score, label = rows[i]
if label == 1:
positive_so_far += 1
else:
remaining_negatives -= 1
acc = (positive_so_far + remaining_negatives) / len(labels)
if acc > max_acc:
max_acc = acc
best_threshold = (rows[i][0] + rows[i + 1][0]) / 2
return max_acc, best_threshold
@staticmethod
def find_best_f1_and_threshold(scores, labels, high_score_more_similar: bool):
assert len(scores) == len(labels)
scores = np.asarray(scores)
labels = np.asarray(labels)
rows = list(zip(scores, labels))
rows = sorted(rows, key=lambda x: x[0], reverse=high_score_more_similar)
best_f1 = best_precision = best_recall = 0
threshold = 0
nextract = 0
ncorrect = 0
total_num_duplicates = sum(labels)
for i in range(len(rows) - 1):
score, label = rows[i]
nextract += 1
if label == 1:
ncorrect += 1
if ncorrect > 0:
precision = ncorrect / nextract
recall = ncorrect / total_num_duplicates
f1 = 2 * precision * recall / (precision + recall)
if f1 > best_f1:
best_f1 = f1
best_precision = precision
best_recall = recall
threshold = (rows[i][0] + rows[i + 1][0]) / 2
return best_f1, best_precision, best_recall, threshold
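# Usage sketch (illustrative only): duplicate-pair evaluation with BinaryClassificationEvaluator.
# The checkpoint name and the toy pairs are assumptions chosen for illustration.
from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import BinaryClassificationEvaluator

model = SentenceTransformer("all-MiniLM-L6-v2")  # assumed checkpoint
sentences1 = ["How do I learn Python?", "What is the capital of France?"]
sentences2 = ["What is the best way to learn Python?", "Who wrote Hamlet?"]
labels = [1, 0]  # 1 = similar / duplicate pair, 0 = dissimilar pair
evaluator = BinaryClassificationEvaluator(sentences1, sentences2, labels, name="toy-pairs")
print(evaluator(model))  # highest Average Precision across the similarity/distance metrics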
from contextlib import nullcontext
from . import SentenceEvaluator, SimilarityFunction
import logging
import os
import csv
from sklearn.metrics.pairwise import paired_cosine_distances, paired_euclidean_distances, paired_manhattan_distances
from scipy.stats import pearsonr, spearmanr
import numpy as np
from typing import List, Literal, Optional
from ..readers import InputExample
logger = logging.getLogger(__name__)
class EmbeddingSimilarityEvaluator(SentenceEvaluator):
"""
Evaluate a model based on the similarity of the embeddings by calculating the Pearson and Spearman correlations
in comparison to the gold standard labels.
The metrics are cosine similarity, dot product, and the Euclidean and Manhattan distances.
The returned score is the Spearman correlation with a specified metric.
The results are written in a CSV. If a CSV already exists, then values are appended.
"""
def __init__(
self,
sentences1: List[str],
sentences2: List[str],
scores: List[float],
batch_size: int = 16,
main_similarity: SimilarityFunction = None,
name: str = "",
show_progress_bar: bool = False,
write_csv: bool = True,
precision: Optional[Literal["float32", "int8", "uint8", "binary", "ubinary"]] = None,
truncate_dim: Optional[int] = None,
):
"""
Constructs an evaluator for the dataset.
The labels need to indicate the similarity between the sentences.
:param sentences1: List with the first sentence in a pair
:param sentences2: List with the second sentence in a pair
:param scores: Similarity score between sentences1[i] and sentences2[i]
:param write_csv: Write results to a CSV file
:param precision: The precision to use for the embeddings. Can be "float32", "int8", "uint8", "binary", or
"ubinary". Defaults to None.
:param truncate_dim: The dimension to truncate sentence embeddings to. `None` uses the model's current
truncation dimension. Defaults to None.
"""
self.sentences1 = sentences1
self.sentences2 = sentences2
self.scores = scores
self.write_csv = write_csv
self.precision = precision
self.truncate_dim = truncate_dim
assert len(self.sentences1) == len(self.sentences2)
assert len(self.sentences1) == len(self.scores)
self.main_similarity = main_similarity
self.name = name
self.batch_size = batch_size
if show_progress_bar is None:
show_progress_bar = (
logger.getEffectiveLevel() == logging.INFO or logger.getEffectiveLevel() == logging.DEBUG
)
self.show_progress_bar = show_progress_bar
self.csv_file = (
"similarity_evaluation"
+ ("_" + name if name else "")
+ ("_" + precision if precision else "")
+ "_results.csv"
)
self.csv_headers = [
"epoch",
"steps",
"cosine_pearson",
"cosine_spearman",
"euclidean_pearson",
"euclidean_spearman",
"manhattan_pearson",
"manhattan_spearman",
"dot_pearson",
"dot_spearman",
]
@classmethod
def from_input_examples(cls, examples: List[InputExample], **kwargs):
sentences1 = []
sentences2 = []
scores = []
for example in examples:
sentences1.append(example.texts[0])
sentences2.append(example.texts[1])
scores.append(example.label)
return cls(sentences1, sentences2, scores, **kwargs)
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
if epoch != -1:
if steps == -1:
out_txt = " after epoch {}:".format(epoch)
else:
out_txt = " in epoch {} after {} steps:".format(epoch, steps)
else:
out_txt = ":"
logger.info("EmbeddingSimilarityEvaluator: Evaluating the model on " + self.name + " dataset" + out_txt)
with nullcontext() if self.truncate_dim is None else model.truncate_sentence_embeddings(self.truncate_dim):
embeddings1 = model.encode(
self.sentences1,
batch_size=self.batch_size,
show_progress_bar=self.show_progress_bar,
convert_to_numpy=True,
precision=self.precision,
normalize_embeddings=bool(self.precision),
)
embeddings2 = model.encode(
self.sentences2,
batch_size=self.batch_size,
show_progress_bar=self.show_progress_bar,
convert_to_numpy=True,
precision=self.precision,
normalize_embeddings=bool(self.precision),
)
# Binary and ubinary embeddings are packed, so we need to unpack them for the distance metrics
if self.precision == "binary":
embeddings1 = (embeddings1 + 128).astype(np.uint8)
embeddings2 = (embeddings2 + 128).astype(np.uint8)
if self.precision in ("ubinary", "binary"):
embeddings1 = np.unpackbits(embeddings1, axis=1)
embeddings2 = np.unpackbits(embeddings2, axis=1)
labels = self.scores
cosine_scores = 1 - (paired_cosine_distances(embeddings1, embeddings2))
manhattan_distances = -paired_manhattan_distances(embeddings1, embeddings2)
euclidean_distances = -paired_euclidean_distances(embeddings1, embeddings2)
dot_products = [np.dot(emb1, emb2) for emb1, emb2 in zip(embeddings1, embeddings2)]
eval_pearson_cosine, _ = pearsonr(labels, cosine_scores)
eval_spearman_cosine, _ = spearmanr(labels, cosine_scores)
eval_pearson_manhattan, _ = pearsonr(labels, manhattan_distances)
eval_spearman_manhattan, _ = spearmanr(labels, manhattan_distances)
eval_pearson_euclidean, _ = pearsonr(labels, euclidean_distances)
eval_spearman_euclidean, _ = spearmanr(labels, euclidean_distances)
eval_pearson_dot, _ = pearsonr(labels, dot_products)
eval_spearman_dot, _ = spearmanr(labels, dot_products)
logger.info(
"Cosine-Similarity :\tPearson: {:.4f}\tSpearman: {:.4f}".format(eval_pearson_cosine, eval_spearman_cosine)
)
logger.info(
"Manhattan-Distance:\tPearson: {:.4f}\tSpearman: {:.4f}".format(
eval_pearson_manhattan, eval_spearman_manhattan
)
)
logger.info(
"Euclidean-Distance:\tPearson: {:.4f}\tSpearman: {:.4f}".format(
eval_pearson_euclidean, eval_spearman_euclidean
)
)
logger.info(
"Dot-Product-Similarity:\tPearson: {:.4f}\tSpearman: {:.4f}".format(eval_pearson_dot, eval_spearman_dot)
)
if output_path is not None and self.write_csv:
csv_path = os.path.join(output_path, self.csv_file)
output_file_exists = os.path.isfile(csv_path)
with open(csv_path, newline="", mode="a" if output_file_exists else "w", encoding="utf-8") as f:
writer = csv.writer(f)
if not output_file_exists:
writer.writerow(self.csv_headers)
writer.writerow(
[
epoch,
steps,
eval_pearson_cosine,
eval_spearman_cosine,
eval_pearson_euclidean,
eval_spearman_euclidean,
eval_pearson_manhattan,
eval_spearman_manhattan,
eval_pearson_dot,
eval_spearman_dot,
]
)
if self.main_similarity == SimilarityFunction.COSINE:
return eval_spearman_cosine
elif self.main_similarity == SimilarityFunction.EUCLIDEAN:
return eval_spearman_euclidean
elif self.main_similarity == SimilarityFunction.MANHATTAN:
return eval_spearman_manhattan
elif self.main_similarity == SimilarityFunction.DOT_PRODUCT:
return eval_spearman_dot
elif self.main_similarity is None:
return max(eval_spearman_cosine, eval_spearman_manhattan, eval_spearman_euclidean, eval_spearman_dot)
else:
raise ValueError("Unknown main_similarity value")
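# Usage sketch (illustrative only): STS-style evaluation where the gold scores are similarities
# in [0, 1]. The checkpoint name and the example pairs are assumptions chosen for illustration.
from sentence_transformers import SentenceTransformer, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

dev_examples = [
    InputExample(texts=["A man is playing guitar.", "A person plays a guitar."], label=0.9),
    InputExample(texts=["A man is playing guitar.", "A child is reading a book."], label=0.2),
    InputExample(texts=["A man is playing guitar.", "A dog runs in the park."], label=0.1),
]
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_examples, name="toy-sts")
model = SentenceTransformer("all-MiniLM-L6-v2")  # assumed checkpoint
print(evaluator(model))  # best Spearman correlation when main_similarity is not set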
from . import SentenceEvaluator
import torch
from torch import Tensor
import logging
from tqdm import trange
from ..util import cos_sim, dot_score
import os
import numpy as np
from typing import List, Dict, Set, Callable
import heapq
logger = logging.getLogger(__name__)
class InformationRetrievalEvaluator(SentenceEvaluator):
"""
This class evaluates an Information Retrieval (IR) setting.
Given a set of queries and a large corpus, it will retrieve for each query the top-k most similar documents. It measures
Mean Reciprocal Rank (MRR), Recall@k, and Normalized Discounted Cumulative Gain (NDCG).
"""
def __init__(
self,
queries: Dict[str, str], # qid => query
corpus: Dict[str, str], # cid => doc
relevant_docs: Dict[str, Set[str]], # qid => Set[cid]
corpus_chunk_size: int = 50000,
mrr_at_k: List[int] = [10],
ndcg_at_k: List[int] = [10],
accuracy_at_k: List[int] = [1, 3, 5, 10],
precision_recall_at_k: List[int] = [1, 3, 5, 10],
map_at_k: List[int] = [100],
show_progress_bar: bool = False,
batch_size: int = 32,
name: str = "",
write_csv: bool = True,
score_functions: Dict[str, Callable[[Tensor, Tensor], Tensor]] = {
"cos_sim": cos_sim,
"dot_score": dot_score,
}, # Score function, higher=more similar
main_score_function: str = None,
):
self.queries_ids = []
for qid in queries:
if qid in relevant_docs and len(relevant_docs[qid]) > 0:
self.queries_ids.append(qid)
self.queries = [queries[qid] for qid in self.queries_ids]
self.corpus_ids = list(corpus.keys())
self.corpus = [corpus[cid] for cid in self.corpus_ids]
self.relevant_docs = relevant_docs
self.corpus_chunk_size = corpus_chunk_size
self.mrr_at_k = mrr_at_k
self.ndcg_at_k = ndcg_at_k
self.accuracy_at_k = accuracy_at_k
self.precision_recall_at_k = precision_recall_at_k
self.map_at_k = map_at_k
self.show_progress_bar = show_progress_bar
self.batch_size = batch_size
self.name = name
self.write_csv = write_csv
self.score_functions = score_functions
self.score_function_names = sorted(list(self.score_functions.keys()))
self.main_score_function = main_score_function
if name:
name = "_" + name
self.csv_file: str = "Information-Retrieval_evaluation" + name + "_results.csv"
self.csv_headers = ["epoch", "steps"]
for score_name in self.score_function_names:
for k in accuracy_at_k:
self.csv_headers.append("{}-Accuracy@{}".format(score_name, k))
for k in precision_recall_at_k:
self.csv_headers.append("{}-Precision@{}".format(score_name, k))
self.csv_headers.append("{}-Recall@{}".format(score_name, k))
for k in mrr_at_k:
self.csv_headers.append("{}-MRR@{}".format(score_name, k))
for k in ndcg_at_k:
self.csv_headers.append("{}-NDCG@{}".format(score_name, k))
for k in map_at_k:
self.csv_headers.append("{}-MAP@{}".format(score_name, k))
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1, *args, **kwargs) -> float:
if epoch != -1:
out_txt = (
" after epoch {}:".format(epoch)
if steps == -1
else " in epoch {} after {} steps:".format(epoch, steps)
)
else:
out_txt = ":"
logger.info("Information Retrieval Evaluation on " + self.name + " dataset" + out_txt)
scores = self.compute_metrices(model, *args, **kwargs)
# Write results to disc
if output_path is not None and self.write_csv:
csv_path = os.path.join(output_path, self.csv_file)
if not os.path.isfile(csv_path):
fOut = open(csv_path, mode="w", encoding="utf-8")
fOut.write(",".join(self.csv_headers))
fOut.write("\n")
else:
fOut = open(csv_path, mode="a", encoding="utf-8")
output_data = [epoch, steps]
for name in self.score_function_names:
for k in self.accuracy_at_k:
output_data.append(scores[name]["accuracy@k"][k])
for k in self.precision_recall_at_k:
output_data.append(scores[name]["precision@k"][k])
output_data.append(scores[name]["recall@k"][k])
for k in self.mrr_at_k:
output_data.append(scores[name]["mrr@k"][k])
for k in self.ndcg_at_k:
output_data.append(scores[name]["ndcg@k"][k])
for k in self.map_at_k:
output_data.append(scores[name]["map@k"][k])
fOut.write(",".join(map(str, output_data)))
fOut.write("\n")
fOut.close()
if self.main_score_function is None:
return max([scores[name]["map@k"][max(self.map_at_k)] for name in self.score_function_names])
else:
return scores[self.main_score_function]["map@k"][max(self.map_at_k)]
def compute_metrices(self, model, corpus_model=None, corpus_embeddings: Tensor = None) -> Dict[str, float]:
if corpus_model is None:
corpus_model = model
max_k = max(
max(self.mrr_at_k),
max(self.ndcg_at_k),
max(self.accuracy_at_k),
max(self.precision_recall_at_k),
max(self.map_at_k),
)
# Compute embedding for the queries
query_embeddings = model.encode(
self.queries, show_progress_bar=self.show_progress_bar, batch_size=self.batch_size, convert_to_tensor=True
)
queries_result_list = {}
for name in self.score_functions:
queries_result_list[name] = [[] for _ in range(len(query_embeddings))]
# Iterate over chunks of the corpus
for corpus_start_idx in trange(
0, len(self.corpus), self.corpus_chunk_size, desc="Corpus Chunks", disable=not self.show_progress_bar
):
corpus_end_idx = min(corpus_start_idx + self.corpus_chunk_size, len(self.corpus))
# Encode chunk of corpus
if corpus_embeddings is None:
sub_corpus_embeddings = corpus_model.encode(
self.corpus[corpus_start_idx:corpus_end_idx],
show_progress_bar=False,
batch_size=self.batch_size,
convert_to_tensor=True,
)
else:
sub_corpus_embeddings = corpus_embeddings[corpus_start_idx:corpus_end_idx]
# Compute similarities for each score function
for name, score_function in self.score_functions.items():
pair_scores = score_function(query_embeddings, sub_corpus_embeddings)
# Get top-k values
pair_scores_top_k_values, pair_scores_top_k_idx = torch.topk(
pair_scores, min(max_k, len(pair_scores[0])), dim=1, largest=True, sorted=False
)
pair_scores_top_k_values = pair_scores_top_k_values.cpu().tolist()
pair_scores_top_k_idx = pair_scores_top_k_idx.cpu().tolist()
for query_itr in range(len(query_embeddings)):
for sub_corpus_id, score in zip(
pair_scores_top_k_idx[query_itr], pair_scores_top_k_values[query_itr]
):
corpus_id = self.corpus_ids[corpus_start_idx + sub_corpus_id]
if len(queries_result_list[name][query_itr]) < max_k:
heapq.heappush(
queries_result_list[name][query_itr], (score, corpus_id)
) # heapq orders tuples by their first element, i.e. the score
else:
heapq.heappushpop(queries_result_list[name][query_itr], (score, corpus_id))
for name in queries_result_list:
for query_itr in range(len(queries_result_list[name])):
for doc_itr in range(len(queries_result_list[name][query_itr])):
score, corpus_id = queries_result_list[name][query_itr][doc_itr]
queries_result_list[name][query_itr][doc_itr] = {"corpus_id": corpus_id, "score": score}
logger.info("Queries: {}".format(len(self.queries)))
logger.info("Corpus: {}\n".format(len(self.corpus)))
# Compute scores
scores = {name: self.compute_metrics(queries_result_list[name]) for name in self.score_functions}
# Output
for name in self.score_function_names:
logger.info("Score-Function: {}".format(name))
self.output_scores(scores[name])
return scores
def compute_metrics(self, queries_result_list: List[object]):
# Init score computation values
num_hits_at_k = {k: 0 for k in self.accuracy_at_k}
precisions_at_k = {k: [] for k in self.precision_recall_at_k}
recall_at_k = {k: [] for k in self.precision_recall_at_k}
MRR = {k: 0 for k in self.mrr_at_k}
ndcg = {k: [] for k in self.ndcg_at_k}
AveP_at_k = {k: [] for k in self.map_at_k}
# Compute scores on results
for query_itr in range(len(queries_result_list)):
query_id = self.queries_ids[query_itr]
# Sort scores
top_hits = sorted(queries_result_list[query_itr], key=lambda x: x["score"], reverse=True)
query_relevant_docs = self.relevant_docs[query_id]
# Accuracy@k - We count the result as correct if at least one relevant doc is among the top-k documents
for k_val in self.accuracy_at_k:
for hit in top_hits[0:k_val]:
if hit["corpus_id"] in query_relevant_docs:
num_hits_at_k[k_val] += 1
break
# Precision and Recall@k
for k_val in self.precision_recall_at_k:
num_correct = 0
for hit in top_hits[0:k_val]:
if hit["corpus_id"] in query_relevant_docs:
num_correct += 1
precisions_at_k[k_val].append(num_correct / k_val)
recall_at_k[k_val].append(num_correct / len(query_relevant_docs))
# MRR@k
for k_val in self.mrr_at_k:
for rank, hit in enumerate(top_hits[0:k_val]):
if hit["corpus_id"] in query_relevant_docs:
MRR[k_val] += 1.0 / (rank + 1)
break
# NDCG@k
for k_val in self.ndcg_at_k:
predicted_relevance = [
1 if top_hit["corpus_id"] in query_relevant_docs else 0 for top_hit in top_hits[0:k_val]
]
true_relevances = [1] * len(query_relevant_docs)
ndcg_value = self.compute_dcg_at_k(predicted_relevance, k_val) / self.compute_dcg_at_k(
true_relevances, k_val
)
ndcg[k_val].append(ndcg_value)
# MAP@k
for k_val in self.map_at_k:
num_correct = 0
sum_precisions = 0
for rank, hit in enumerate(top_hits[0:k_val]):
if hit["corpus_id"] in query_relevant_docs:
num_correct += 1
sum_precisions += num_correct / (rank + 1)
avg_precision = sum_precisions / min(k_val, len(query_relevant_docs))
AveP_at_k[k_val].append(avg_precision)
# Compute averages
for k in num_hits_at_k:
num_hits_at_k[k] /= len(self.queries)
for k in precisions_at_k:
precisions_at_k[k] = np.mean(precisions_at_k[k])
for k in recall_at_k:
recall_at_k[k] = np.mean(recall_at_k[k])
for k in ndcg:
ndcg[k] = np.mean(ndcg[k])
for k in MRR:
MRR[k] /= len(self.queries)
for k in AveP_at_k:
AveP_at_k[k] = np.mean(AveP_at_k[k])
return {
"accuracy@k": num_hits_at_k,
"precision@k": precisions_at_k,
"recall@k": recall_at_k,
"ndcg@k": ndcg,
"mrr@k": MRR,
"map@k": AveP_at_k,
}
def output_scores(self, scores):
for k in scores["accuracy@k"]:
logger.info("Accuracy@{}: {:.2f}%".format(k, scores["accuracy@k"][k] * 100))
for k in scores["precision@k"]:
logger.info("Precision@{}: {:.2f}%".format(k, scores["precision@k"][k] * 100))
for k in scores["recall@k"]:
logger.info("Recall@{}: {:.2f}%".format(k, scores["recall@k"][k] * 100))
for k in scores["mrr@k"]:
logger.info("MRR@{}: {:.4f}".format(k, scores["mrr@k"][k]))
for k in scores["ndcg@k"]:
logger.info("NDCG@{}: {:.4f}".format(k, scores["ndcg@k"][k]))
for k in scores["map@k"]:
logger.info("MAP@{}: {:.4f}".format(k, scores["map@k"][k]))
@staticmethod
def compute_dcg_at_k(relevances, k):
dcg = 0
for i in range(min(len(relevances), k)):
dcg += relevances[i] / np.log2(i + 2) # +2 as we start our idx at 0
return dcg
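# Usage sketch (illustrative only): a miniature information-retrieval evaluation. The IDs, texts
# and checkpoint name are assumptions chosen for illustration.
from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import InformationRetrievalEvaluator

queries = {"q1": "What is the capital of France?"}
corpus = {
    "d1": "Paris is the capital and most populous city of France.",
    "d2": "Berlin is the capital of Germany.",
}
relevant_docs = {"q1": {"d1"}}
evaluator = InformationRetrievalEvaluator(queries, corpus, relevant_docs, name="toy-ir")
model = SentenceTransformer("all-MiniLM-L6-v2")  # assumed checkpoint
print(evaluator(model))  # MAP@100 of the best score function (cos_sim or dot_score) by default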
from . import SentenceEvaluator
import torch
from torch.utils.data import DataLoader
import logging
from ..util import batch_to_device
import os
import csv
logger = logging.getLogger(__name__)
class LabelAccuracyEvaluator(SentenceEvaluator):
"""
Evaluate a model based on its accuracy on a labeled dataset.
This requires a model trained with SoftmaxLoss; pass the loss object as `softmax_model`.
The results are written in a CSV. If a CSV already exists, then values are appended.
"""
def __init__(self, dataloader: DataLoader, name: str = "", softmax_model=None, write_csv: bool = True):
"""
Constructs an evaluator for the given dataset
:param dataloader:
the data for the evaluation
"""
self.dataloader = dataloader
self.name = name
self.softmax_model = softmax_model
if name:
name = "_" + name
self.write_csv = write_csv
self.csv_file = "accuracy_evaluation" + name + "_results.csv"
self.csv_headers = ["epoch", "steps", "accuracy"]
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
model.eval()
total = 0
correct = 0
if epoch != -1:
if steps == -1:
out_txt = " after epoch {}:".format(epoch)
else:
out_txt = " in epoch {} after {} steps:".format(epoch, steps)
else:
out_txt = ":"
logger.info("Evaluation on the " + self.name + " dataset" + out_txt)
self.dataloader.collate_fn = model.smart_batching_collate
for step, batch in enumerate(self.dataloader):
features, label_ids = batch
for idx in range(len(features)):
features[idx] = batch_to_device(features[idx], model.device)
label_ids = label_ids.to(model.device)
with torch.no_grad():
_, prediction = self.softmax_model(features, labels=None)
total += prediction.size(0)
correct += torch.argmax(prediction, dim=1).eq(label_ids).sum().item()
accuracy = correct / total
logger.info("Accuracy: {:.4f} ({}/{})\n".format(accuracy, correct, total))
if output_path is not None and self.write_csv:
csv_path = os.path.join(output_path, self.csv_file)
if not os.path.isfile(csv_path):
with open(csv_path, newline="", mode="w", encoding="utf-8") as f:
writer = csv.writer(f)
writer.writerow(self.csv_headers)
writer.writerow([epoch, steps, accuracy])
else:
with open(csv_path, newline="", mode="a", encoding="utf-8") as f:
writer = csv.writer(f)
writer.writerow([epoch, steps, accuracy])
return accuracy
from sentence_transformers.evaluation import SentenceEvaluator
import logging
import os
import csv
from typing import List
logger = logging.getLogger(__name__)
class MSEEvaluator(SentenceEvaluator):
"""
Computes the mean squared error (x100) between the computed sentence embedding
and some target sentence embedding.
The MSE is computed between teacher.encode(source_sentences) and student.encode(target_sentences), i.e. the mean of the squared element-wise differences.
For multilingual knowledge distillation (https://arxiv.org/abs/2004.09813), source_sentences are in English
and target_sentences are in a different language like German, Chinese, Spanish...
:param source_sentences: Source sentences are embedded with the teacher model
:param target_sentences: Target sentences are embedded with the student model.
:param show_progress_bar: Show progress bar when computing embeddings
:param batch_size: Batch size to compute sentence embeddings
:param name: Name of the evaluator
:param write_csv: Write results to CSV file
"""
def __init__(
self,
source_sentences: List[str],
target_sentences: List[str],
teacher_model=None,
show_progress_bar: bool = False,
batch_size: int = 32,
name: str = "",
write_csv: bool = True,
):
self.source_embeddings = teacher_model.encode(
source_sentences, show_progress_bar=show_progress_bar, batch_size=batch_size, convert_to_numpy=True
)
self.target_sentences = target_sentences
self.show_progress_bar = show_progress_bar
self.batch_size = batch_size
self.name = name
self.csv_file = "mse_evaluation_" + name + "_results.csv"
self.csv_headers = ["epoch", "steps", "MSE"]
self.write_csv = write_csv
def __call__(self, model, output_path, epoch=-1, steps=-1):
if epoch != -1:
if steps == -1:
out_txt = " after epoch {}:".format(epoch)
else:
out_txt = " in epoch {} after {} steps:".format(epoch, steps)
else:
out_txt = ":"
target_embeddings = model.encode(
self.target_sentences,
show_progress_bar=self.show_progress_bar,
batch_size=self.batch_size,
convert_to_numpy=True,
)
mse = ((self.source_embeddings - target_embeddings) ** 2).mean()
mse *= 100
logger.info("MSE evaluation (lower = better) on " + self.name + " dataset" + out_txt)
logger.info("MSE (*100):\t{:4f}".format(mse))
if output_path is not None and self.write_csv:
csv_path = os.path.join(output_path, self.csv_file)
output_file_exists = os.path.isfile(csv_path)
with open(csv_path, newline="", mode="a" if output_file_exists else "w", encoding="utf-8") as f:
writer = csv.writer(f)
if not output_file_exists:
writer.writerow(self.csv_headers)
writer.writerow([epoch, steps, mse])
return -mse # Return negative score as SentenceTransformers maximizes the performance
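# Usage sketch (illustrative only): multilingual distillation evaluation with MSEEvaluator.
# The model names and sentences are assumptions chosen for illustration.
from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import MSEEvaluator

teacher = SentenceTransformer("paraphrase-distilroberta-base-v2")       # assumed teacher
student = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")  # assumed student
source_sentences = ["Hello world", "How are you?"]     # teacher-side sentences (e.g. English)
target_sentences = ["Hallo Welt", "Wie geht es dir?"]  # student-side sentences (e.g. German)
evaluator = MSEEvaluator(source_sentences, target_sentences, teacher_model=teacher, name="en-de")
print(evaluator(student, output_path=None))  # negative MSE*100, since higher is treated as better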
from sentence_transformers.evaluation import SentenceEvaluator
from sentence_transformers import SentenceTransformer
from typing import List, Tuple, Dict
import numpy as np
import logging
import os
import csv
logger = logging.getLogger(__name__)
class MSEEvaluatorFromDataFrame(SentenceEvaluator):
"""
Computes the mean squared error (x100) between the computed sentence embedding and some target sentence embedding.
:param dataframe: It must have the following format. Rows contain different, parallel sentences.
Columns are the respective language codes::
[{'en': 'My sentence', 'es': 'Sentence in Spanish', 'fr': 'Sentence in French'...},
{'en': 'My second sentence', ...}]
:param combinations: Must be of the format ``[('en', 'es'), ('en', 'fr'), ...]``.
The first entry in a tuple is the source language. The sentence in the respective language will be fetched from
the dataframe and passed to the teacher model. The second entry in a tuple is the target language. The sentence
will be fetched from the dataframe and passed to the student model.
"""
def __init__(
self,
dataframe: List[Dict[str, str]],
teacher_model: SentenceTransformer,
combinations: List[Tuple[str, str]],
batch_size: int = 8,
name="",
write_csv: bool = True,
):
self.combinations = combinations
self.name = name
self.batch_size = batch_size
if name:
name = "_" + name
self.csv_file = "mse_evaluation" + name + "_results.csv"
self.csv_headers = ["epoch", "steps"]
self.write_csv = write_csv
self.data = {}
logger.info("Compute teacher embeddings")
all_source_sentences = set()
for src_lang, trg_lang in self.combinations:
src_sentences = []
trg_sentences = []
for row in dataframe:
if row[src_lang].strip() != "" and row[trg_lang].strip() != "":
all_source_sentences.add(row[src_lang])
src_sentences.append(row[src_lang])
trg_sentences.append(row[trg_lang])
self.data[(src_lang, trg_lang)] = (src_sentences, trg_sentences)
self.csv_headers.append("{}-{}".format(src_lang, trg_lang))
all_source_sentences = list(all_source_sentences)
all_src_embeddings = teacher_model.encode(all_source_sentences, batch_size=self.batch_size)
self.teacher_embeddings = {sent: emb for sent, emb in zip(all_source_sentences, all_src_embeddings)}
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1):
model.eval()
mse_scores = []
for src_lang, trg_lang in self.combinations:
src_sentences, trg_sentences = self.data[(src_lang, trg_lang)]
src_embeddings = np.asarray([self.teacher_embeddings[sent] for sent in src_sentences])
trg_embeddings = np.asarray(model.encode(trg_sentences, batch_size=self.batch_size))
mse = ((src_embeddings - trg_embeddings) ** 2).mean()
mse *= 100
mse_scores.append(mse)
logger.info("MSE evaluation on {} dataset - {}-{}:".format(self.name, src_lang, trg_lang))
logger.info("MSE (*100):\t{:4f}".format(mse))
if output_path is not None and self.write_csv:
csv_path = os.path.join(output_path, self.csv_file)
output_file_exists = os.path.isfile(csv_path)
with open(csv_path, newline="", mode="a" if output_file_exists else "w", encoding="utf-8") as f:
writer = csv.writer(f)
if not output_file_exists:
writer.writerow(self.csv_headers)
writer.writerow([epoch, steps] + mse_scores)
return -np.mean(mse_scores) # Return negative score as SentenceTransformers maximizes the performance
from . import SentenceEvaluator
import logging
from sentence_transformers.util import paraphrase_mining
import os
import csv
from typing import List, Tuple, Dict
from collections import defaultdict
logger = logging.getLogger(__name__)
class ParaphraseMiningEvaluator(SentenceEvaluator):
"""
Given a large set of sentences, this evaluator performs paraphrase (duplicate) mining and
identifies the pairs with the highest similarity. It compares the extracted paraphrase pairs
with a set of gold labels and computes the F1 score.
"""
def __init__(
self,
sentences_map: Dict[str, str],
duplicates_list: List[Tuple[str, str]] = None,
duplicates_dict: Dict[str, Dict[str, bool]] = None,
add_transitive_closure: bool = False,
query_chunk_size: int = 5000,
corpus_chunk_size: int = 100000,
max_pairs: int = 500000,
top_k: int = 100,
show_progress_bar: bool = False,
batch_size: int = 16,
name: str = "",
write_csv: bool = True,
):
"""
:param sentences_map: A dictionary that maps sentence-ids to sentences, i.e. sentences_map[id] => sentence.
:param duplicates_list: A list of id pairs [(id1, id2), (id1, id5)] that identifies the duplicates / paraphrases in the sentences_map
:param duplicates_dict: A default dictionary mapping [id1][id2] to True if id1 and id2 are duplicates. Must be symmetric, i.e., if [id1][id2] => True, then [id2][id1] => True.
:param add_transitive_closure: If True, adds the transitive closure, i.e. if dup[a][b] and dup[b][c], then dup[a][c]
:param query_chunk_size: To identify the paraphrases, the cosine similarity between all sentence pairs is computed. As this might require a lot of memory, the computation is batched: query_chunk_size sentences are compared against up to corpus_chunk_size other sentences at a time. With the default settings, 5000 sentences are grouped together and compared against up to 100k other sentences.
:param corpus_chunk_size: The corpus is processed in batches of this size to reduce the memory requirement
:param max_pairs: Only up to max_pairs potential paraphrase candidates are extracted
:param top_k: For each query, the top_k most similar pairs are extracted and added to a sorted list, i.e., for a single sentence no more than top_k paraphrases can be found
:param show_progress_bar: Output a progress bar
:param batch_size: Batch size for computing sentence embeddings
:param name: Name of the experiment
:param write_csv: Write results to CSV file
"""
self.sentences = []
self.ids = []
for id, sentence in sentences_map.items():
self.sentences.append(sentence)
self.ids.append(id)
self.name = name
self.show_progress_bar = show_progress_bar
self.batch_size = batch_size
self.query_chunk_size = query_chunk_size
self.corpus_chunk_size = corpus_chunk_size
self.max_pairs = max_pairs
self.top_k = top_k
self.duplicates = duplicates_dict if duplicates_dict is not None else defaultdict(lambda: defaultdict(bool))
if duplicates_list is not None:
for id1, id2 in duplicates_list:
if id1 in sentences_map and id2 in sentences_map:
self.duplicates[id1][id2] = True
self.duplicates[id2][id1] = True
# Add transitive closure
if add_transitive_closure:
self.duplicates = self.add_transitive_closure(self.duplicates)
positive_key_pairs = set()
for key1 in self.duplicates:
for key2 in self.duplicates[key1]:
if (
key1 in sentences_map
and key2 in sentences_map
and (self.duplicates[key1][key2] or self.duplicates[key2][key1])
):
positive_key_pairs.add(tuple(sorted([key1, key2])))
self.total_num_duplicates = len(positive_key_pairs)
if name:
name = "_" + name
self.csv_file: str = "paraphrase_mining_evaluation" + name + "_results.csv"
self.csv_headers = ["epoch", "steps", "precision", "recall", "f1", "threshold", "average_precision"]
self.write_csv = write_csv
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
if epoch != -1:
out_txt = f" after epoch {epoch}:" if steps == -1 else f" in epoch {epoch} after {steps} steps:"
else:
out_txt = ":"
logger.info("Paraphrase Mining Evaluation on " + self.name + " dataset" + out_txt)
# Compute embedding for the sentences
pairs_list = paraphrase_mining(
model,
self.sentences,
self.show_progress_bar,
self.batch_size,
self.query_chunk_size,
self.corpus_chunk_size,
self.max_pairs,
self.top_k,
)
logger.info("Number of candidate pairs: " + str(len(pairs_list)))
# Compute F1 score and Average Precision
n_extract = n_correct = 0
threshold = 0
best_f1 = best_recall = best_precision = 0
average_precision = 0
for idx in range(len(pairs_list)):
score, i, j = pairs_list[idx]
id1 = self.ids[i]
id2 = self.ids[j]
# Compute optimal threshold and F1-score
n_extract += 1
if self.duplicates[id1][id2] or self.duplicates[id2][id1]:
n_correct += 1
precision = n_correct / n_extract
recall = n_correct / self.total_num_duplicates
f1 = 2 * precision * recall / (precision + recall)
average_precision += precision
if f1 > best_f1:
best_f1 = f1
best_precision = precision
best_recall = recall
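                        # Decision threshold: midpoint between this pair's similarity score and the next candidate's score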
threshold = (pairs_list[idx][0] + pairs_list[min(idx + 1, len(pairs_list) - 1)][0]) / 2
average_precision = average_precision / self.total_num_duplicates
logger.info("Average Precision: {:.2f}".format(average_precision * 100))
logger.info("Optimal threshold: {:.4f}".format(threshold))
logger.info("Precision: {:.2f}".format(best_precision * 100))
logger.info("Recall: {:.2f}".format(best_recall * 100))
logger.info("F1: {:.2f}\n".format(best_f1 * 100))
if output_path is not None and self.write_csv:
csv_path = os.path.join(output_path, self.csv_file)
if not os.path.isfile(csv_path):
with open(csv_path, newline="", mode="w", encoding="utf-8") as f:
writer = csv.writer(f)
writer.writerow(self.csv_headers)
writer.writerow([epoch, steps, best_precision, best_recall, best_f1, threshold, average_precision])
else:
with open(csv_path, newline="", mode="a", encoding="utf-8") as f:
writer = csv.writer(f)
writer.writerow([epoch, steps, best_precision, best_recall, best_f1, threshold, average_precision])
return average_precision
@staticmethod
def add_transitive_closure(graph):
nodes_visited = set()
for a in list(graph.keys()):
if a not in nodes_visited:
connected_subgraph_nodes = set()
connected_subgraph_nodes.add(a)
# Add all nodes in the connected graph
neighbor_nodes_queue = list(graph[a])
while len(neighbor_nodes_queue) > 0:
node = neighbor_nodes_queue.pop(0)
if node not in connected_subgraph_nodes:
connected_subgraph_nodes.add(node)
neighbor_nodes_queue.extend(graph[node])
# Ensure transitivity between all nodes in the graph
connected_subgraph_nodes = list(connected_subgraph_nodes)
for i in range(len(connected_subgraph_nodes) - 1):
for j in range(i + 1, len(connected_subgraph_nodes)):
graph[connected_subgraph_nodes[i]][connected_subgraph_nodes[j]] = True
graph[connected_subgraph_nodes[j]][connected_subgraph_nodes[i]] = True
nodes_visited.add(connected_subgraph_nodes[i])
nodes_visited.add(connected_subgraph_nodes[j])
return graph
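# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original module): wiring the
# evaluator up with a toy sentences_map and duplicates_list. The ids, sentences
# and model name below are hypothetical placeholders.
def _example_paraphrase_mining_evaluation():
    from sentence_transformers import SentenceTransformer
    sentences_map = {
        "q1": "How can I learn Python quickly?",
        "q2": "What is the fastest way to learn Python?",
        "q3": "How tall is Mount Everest?",
    }
    duplicates_list = [("q1", "q2")]
    evaluator = ParaphraseMiningEvaluator(sentences_map, duplicates_list=duplicates_list, name="dev")
    model = SentenceTransformer("all-MiniLM-L6-v2")  # model name is an assumption
    return evaluator(model, output_path=".")  # returns average precision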
from . import SentenceEvaluator
import logging
import numpy as np
import os
import csv
from ..util import cos_sim
import torch
from sklearn.metrics import average_precision_score, ndcg_score
import tqdm
from typing import Optional
logger = logging.getLogger(__name__)
class RerankingEvaluator(SentenceEvaluator):
"""
This class evaluates a SentenceTransformer model for the task of re-ranking.
Given a query and a list of documents, it computes the score [query, doc_i] for all possible
documents and sorts them in decreasing order. Then, MRR@10, NDCG@10 and MAP are computed to measure the quality of the ranking.
:param samples: Must be a list where each element is of the form: {'query': '', 'positive': [], 'negative': []}. Query is the search query,
positive is a list of positive (relevant) documents, negative is a list of negative (irrelevant) documents.
"""
def __init__(
self,
samples,
at_k: int = 10,
name: str = "",
write_csv: bool = True,
similarity_fct=cos_sim,
batch_size: int = 64,
show_progress_bar: bool = False,
use_batched_encoding: bool = True,
mrr_at_k: Optional[int] = None,
):
self.samples = samples
self.name = name
if mrr_at_k is not None:
logger.warning(f"The `mrr_at_k` parameter has been deprecated; please use `at_k={mrr_at_k}` instead.")
self.at_k = mrr_at_k
else:
self.at_k = at_k
self.similarity_fct = similarity_fct
self.batch_size = batch_size
self.show_progress_bar = show_progress_bar
self.use_batched_encoding = use_batched_encoding
if isinstance(self.samples, dict):
self.samples = list(self.samples.values())
### Remove samples with an empty positive / negative set
self.samples = [
sample for sample in self.samples if len(sample["positive"]) > 0 and len(sample["negative"]) > 0
]
self.csv_file = "RerankingEvaluator" + ("_" + name if name else "") + f"_results_@{self.at_k}.csv"
self.csv_headers = [
"epoch",
"steps",
"MAP",
"MRR@{}".format(self.at_k),
"NDCG@{}".format(self.at_k),
]
self.write_csv = write_csv
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
if epoch != -1:
if steps == -1:
out_txt = " after epoch {}:".format(epoch)
else:
out_txt = " in epoch {} after {} steps:".format(epoch, steps)
else:
out_txt = ":"
logger.info("RerankingEvaluator: Evaluating the model on " + self.name + " dataset" + out_txt)
scores = self.compute_metrices(model)
mean_ap = scores["map"]
mean_mrr = scores["mrr"]
mean_ndcg = scores["ndcg"]
#### Some stats about the dataset
num_positives = [len(sample["positive"]) for sample in self.samples]
num_negatives = [len(sample["negative"]) for sample in self.samples]
logger.info(
"Queries: {} \t Positives: Min {:.1f}, Mean {:.1f}, Max {:.1f} \t Negatives: Min {:.1f}, Mean {:.1f}, Max {:.1f}".format(
len(self.samples),
np.min(num_positives),
np.mean(num_positives),
np.max(num_positives),
np.min(num_negatives),
np.mean(num_negatives),
np.max(num_negatives),
)
)
logger.info("MAP: {:.2f}".format(mean_ap * 100))
logger.info("MRR@{}: {:.2f}".format(self.at_k, mean_mrr * 100))
logger.info("NDCG@{}: {:.2f}".format(self.at_k, mean_ndcg * 100))
#### Write results to disc
if output_path is not None and self.write_csv:
csv_path = os.path.join(output_path, self.csv_file)
output_file_exists = os.path.isfile(csv_path)
with open(csv_path, newline="", mode="a" if output_file_exists else "w", encoding="utf-8") as f:
writer = csv.writer(f)
if not output_file_exists:
writer.writerow(self.csv_headers)
writer.writerow([epoch, steps, mean_ap, mean_mrr, mean_ndcg])
return mean_ap
def compute_metrices(self, model):
return (
self.compute_metrices_batched(model)
if self.use_batched_encoding
else self.compute_metrices_individual(model)
)
def compute_metrices_batched(self, model):
"""
Computes the metrics in a batched way, by encoding all queries and
all documents together
"""
all_mrr_scores = []
all_ndcg_scores = []
all_ap_scores = []
all_query_embs = model.encode(
[sample["query"] for sample in self.samples],
convert_to_tensor=True,
batch_size=self.batch_size,
show_progress_bar=self.show_progress_bar,
)
all_docs = []
for sample in self.samples:
all_docs.extend(sample["positive"])
all_docs.extend(sample["negative"])
all_docs_embs = model.encode(
all_docs, convert_to_tensor=True, batch_size=self.batch_size, show_progress_bar=self.show_progress_bar
)
# Compute scores
query_idx, docs_idx = 0, 0
for instance in self.samples:
query_emb = all_query_embs[query_idx]
query_idx += 1
num_pos = len(instance["positive"])
num_neg = len(instance["negative"])
docs_emb = all_docs_embs[docs_idx : docs_idx + num_pos + num_neg]
docs_idx += num_pos + num_neg
if num_pos == 0 or num_neg == 0:
continue
pred_scores = self.similarity_fct(query_emb, docs_emb)
if len(pred_scores.shape) > 1:
pred_scores = pred_scores[0]
pred_scores_argsort = torch.argsort(-pred_scores) # Sort in decreasing order
pred_scores = pred_scores.cpu().tolist()
# Compute MRR score
is_relevant = [1] * num_pos + [0] * num_neg
mrr_score = 0
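            # Reciprocal rank of the first relevant document within the top at_k results,
            # e.g. 1/2 if the first relevant document appears at rank 2 (1-based);
            # stays 0 if none of the top at_k documents is relevant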
for rank, index in enumerate(pred_scores_argsort[0 : self.at_k]):
if is_relevant[index]:
mrr_score = 1 / (rank + 1)
break
all_mrr_scores.append(mrr_score)
# Compute NDCG score
all_ndcg_scores.append(ndcg_score([is_relevant], [pred_scores], k=self.at_k))
# Compute AP
all_ap_scores.append(average_precision_score(is_relevant, pred_scores))
mean_ap = np.mean(all_ap_scores)
mean_mrr = np.mean(all_mrr_scores)
mean_ndcg = np.mean(all_ndcg_scores)
return {"map": mean_ap, "mrr": mean_mrr, "ndcg": mean_ndcg}
def compute_metrices_individual(self, model):
"""
Embeds every (query, positive, negative) tuple individually.
This is slower than the batched version, but saves memory as only the
embeddings for one tuple are needed at a time. Useful when you have
a really large test set.
"""
all_mrr_scores = []
all_ndcg_scores = []
all_ap_scores = []
for instance in tqdm.tqdm(self.samples, disable=not self.show_progress_bar, desc="Samples"):
query = instance["query"]
positive = list(instance["positive"])
negative = list(instance["negative"])
if len(positive) == 0 or len(negative) == 0:
continue
docs = positive + negative
is_relevant = [1] * len(positive) + [0] * len(negative)
query_emb = model.encode(
[query], convert_to_tensor=True, batch_size=self.batch_size, show_progress_bar=False
)
docs_emb = model.encode(docs, convert_to_tensor=True, batch_size=self.batch_size, show_progress_bar=False)
pred_scores = self.similarity_fct(query_emb, docs_emb)
if len(pred_scores.shape) > 1:
pred_scores = pred_scores[0]
pred_scores_argsort = torch.argsort(-pred_scores) # Sort in decreasing order
pred_scores = pred_scores.cpu().tolist()
# Compute MRR score
mrr_score = 0
for rank, index in enumerate(pred_scores_argsort[0 : self.at_k]):
if is_relevant[index]:
mrr_score = 1 / (rank + 1)
break
all_mrr_scores.append(mrr_score)
# Compute NDCG score
all_ndcg_scores.append(ndcg_score([is_relevant], [pred_scores], k=self.at_k))
# Compute AP
all_ap_scores.append(average_precision_score(is_relevant, pred_scores))
mean_ap = np.mean(all_ap_scores)
mean_mrr = np.mean(all_mrr_scores)
mean_ndcg = np.mean(all_ndcg_scores)
return {"map": mean_ap, "mrr": mean_mrr, "ndcg": mean_ndcg}
class SentenceEvaluator:
"""
Base class for all evaluators
Extend this class and implement __call__ for custom evaluators.
"""
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
"""
This is called during training to evaluate the model.
It returns a score for the evaluation with a higher score indicating a better result.
:param model:
the model to evaluate
:param output_path:
path where predictions and metrics are written to
:param epoch:
the epoch where the evaluation takes place.
This is used for the file prefixes.
If this is -1, then we assume evaluation on test data.
:param steps:
the steps in the current epoch at time of the evaluation.
This is used for the file prefixes.
If this is -1, then we assume evaluation at the end of the epoch.
:return: a score for the evaluation with a higher score indicating a better result
"""
pass
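# ---------------------------------------------------------------------------
# Sketch of a custom evaluator (illustrative, not part of the original module):
# extend SentenceEvaluator and implement __call__ so that a higher return value
# means a better model. The metric below is a hypothetical placeholder.
class _ExampleCosineEvaluator(SentenceEvaluator):
    def __init__(self, sentence_pairs, gold_scores):
        self.sentence_pairs = sentence_pairs  # list of (sentence_a, sentence_b) tuples
        self.gold_scores = gold_scores  # gold similarity per pair, in [0, 1]

    def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
        import numpy as np
        emb_a = model.encode([a for a, _ in self.sentence_pairs], convert_to_numpy=True)
        emb_b = model.encode([b for _, b in self.sentence_pairs], convert_to_numpy=True)
        cos = (emb_a * emb_b).sum(axis=1) / (
            np.linalg.norm(emb_a, axis=1) * np.linalg.norm(emb_b, axis=1)
        )
        # Higher is better: negative mean absolute error against the gold scores
        return -float(np.abs(cos - np.asarray(self.gold_scores)).mean())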
from . import SentenceEvaluator
from typing import Iterable
class SequentialEvaluator(SentenceEvaluator):
"""
This evaluator allows multiple sub-evaluators to be passed. When the model is evaluated,
the data is passed sequentially to all sub-evaluators.
All scores are passed to 'main_score_function', which derives one final score value
"""
def __init__(self, evaluators: Iterable[SentenceEvaluator], main_score_function=lambda scores: scores[-1]):
self.evaluators = evaluators
self.main_score_function = main_score_function
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
scores = []
for evaluator in self.evaluators:
scores.append(evaluator(model, output_path, epoch, steps))
return self.main_score_function(scores)
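# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original module): run several
# evaluators in sequence and average their scores instead of keeping only the
# last one. `evaluator_a` and `evaluator_b` are hypothetical evaluator instances.
def _example_sequential_evaluation(model, evaluator_a, evaluator_b, output_path=None):
    seq_evaluator = SequentialEvaluator(
        [evaluator_a, evaluator_b],
        main_score_function=lambda scores: sum(scores) / len(scores),
    )
    return seq_evaluator(model, output_path=output_path)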