Commit 0fccd232 authored by Rayyyyy

First add
import csv
import logging
import os
from typing import List
import numpy as np
from .. import CrossEncoder
from ... import InputExample
from sklearn.metrics import f1_score
logger = logging.getLogger(__name__)
class CEF1Evaluator:
"""
CrossEncoder F1 score based evaluator for binary and multiclass tasks.
The task type (binary or multiclass) is determined from the labels array. For
binary tasks the returned metric is the binary F1 score. For multiclass tasks
the returned metric is the macro F1 score.
:param sentence_pairs: A list of sentence pairs, where each pair is a list of two strings.
:type sentence_pairs: list[list[str]]
:param labels: A list of integer labels corresponding to each sentence pair.
:type labels: list[int]
:param batch_size: Batch size for prediction. Defaults to 32.
:type batch_size: int
:param show_progress_bar: Show tqdm progress bar.
:type show_progress_bar: bool
:param name: An optional name for the CSV file with stored results. Defaults to an empty string.
:type name: str, optional
:param write_csv: Flag to determine if the data should be saved to a CSV file. Defaults to True.
:type write_csv: bool, optional
"""
def __init__(
self,
sentence_pairs: List[List[str]],
labels: List[int],
*,
batch_size: int = 32,
show_progress_bar: bool = False,
name: str = "",
write_csv: bool = True,
):
self.sentence_pairs = sentence_pairs
self.labels = labels
self.batch_size = batch_size
self.show_progress_bar = show_progress_bar
self.name = name
self.write_csv = write_csv
n_unique = np.unique(labels).size
if n_unique == 2:
self.f1_callables = [
("Binary F1 score", lambda x, y: f1_score(x, y, average="binary")),
]
elif n_unique > 2:
self.f1_callables = [
("Macro F1 score", lambda x, y: f1_score(x, y, average="macro")),
("Micro F1 score", lambda x, y: f1_score(x, y, average="micro")),
("Weighted F1 score", lambda x, y: f1_score(x, y, average="weighted")),
]
else:
raise ValueError(
"Got only one distinct label. Please make sure there are at least two labels in the `labels` array."
)
self.csv_file = "CEF1Evaluator" + (f"_{name}" if name else "") + "_results.csv"
self.csv_headers = ["epoch", "steps"] + [metric_name for metric_name, _ in self.f1_callables]
@classmethod
def from_input_examples(cls, examples: List[InputExample], **kwargs):
sentence_pairs = []
labels = []
for example in examples:
sentence_pairs.append(example.texts)
labels.append(example.label)
return cls(sentence_pairs, labels, **kwargs)
def __call__(
self,
model: CrossEncoder,
output_path: str = None,
epoch: int = -1,
steps: int = -1,
) -> float:
if epoch != -1:
if steps == -1:
out_txt = f"after epoch {epoch}:"
else:
out_txt = f"in epoch {epoch} after {steps} steps:"
else:
out_txt = ":"
logger.info(f"CEF1Evaluator: Evaluating the model on {self.name} dataset {out_txt}")
pred_scores = model.predict(
self.sentence_pairs,
batch_size=self.batch_size,
show_progress_bar=self.show_progress_bar,
convert_to_numpy=True,
)
pred_labels = np.argmax(pred_scores, axis=1)
assert len(pred_labels) == len(self.labels)
save_f1 = []
for f1_name, f1_fn in self.f1_callables:
f1_val = f1_fn(pred_labels, self.labels)
save_f1.append(f1_val)
logger.info(f"{f1_name:20s}: {f1_val * 100:.2f}")
if output_path is not None and self.write_csv:
csv_path = os.path.join(output_path, self.csv_file)
output_file_exists = os.path.isfile(csv_path)
mode = "a" if output_file_exists else "w"
with open(csv_path, mode=mode, encoding="utf-8") as f:
writer = csv.writer(f)
if not output_file_exists:
writer.writerow(self.csv_headers)
writer.writerow([epoch, steps, *save_f1])
return save_f1[0]
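# Usage sketch (illustrative only): how CEF1Evaluator could be used with a CrossEncoder on a
# tiny binary task. The base checkpoint name and the toy sentence pairs below are assumptions,
# not taken from this commit.
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CEF1Evaluator

model = CrossEncoder("distilroberta-base", num_labels=2)  # assumed base model with 2 output logits
sentence_pairs = [
    ["A man is eating food.", "A man is eating a meal."],
    ["A man is eating food.", "The sky is blue."],
]
labels = [1, 0]  # two distinct labels -> the binary F1 score is reported
evaluator = CEF1Evaluator(sentence_pairs, labels, name="toy-dev")
print(evaluator(model, output_path=None))  # a CSV is only written when output_path is given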
import logging
import numpy as np
import os
import csv
from typing import Optional
from sklearn.metrics import ndcg_score
logger = logging.getLogger(__name__)
class CERerankingEvaluator:
"""
This class evaluates a CrossEncoder model for the task of re-ranking.
Given a query and a list of documents, it computes the score [query, doc_i] for all possible
documents and sorts them in decreasing order. Then, MRR@10 and NDCG@10 are computed to measure the quality of the ranking.
:param samples: Must be a list and each element is of the form: {'query': '', 'positive': [], 'negative': []}. Query is the search query,
positive is a list of positive (relevant) documents, negative is a list of negative (irrelevant) documents.
"""
def __init__(
self, samples, at_k: int = 10, name: str = "", write_csv: bool = True, mrr_at_k: Optional[int] = None
):
self.samples = samples
self.name = name
if mrr_at_k is not None:
logger.warning(f"The `mrr_at_k` parameter has been deprecated; please use `at_k={mrr_at_k}` instead.")
self.at_k = mrr_at_k
else:
self.at_k = at_k
if isinstance(self.samples, dict):
self.samples = list(self.samples.values())
self.csv_file = "CERerankingEvaluator" + ("_" + name if name else "") + f"_results_@{self.at_k}.csv"
self.csv_headers = [
"epoch",
"steps",
"MRR@{}".format(self.at_k),
"NDCG@{}".format(self.at_k),
]
self.write_csv = write_csv
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
if epoch != -1:
if steps == -1:
out_txt = " after epoch {}:".format(epoch)
else:
out_txt = " in epoch {} after {} steps:".format(epoch, steps)
else:
out_txt = ":"
logger.info("CERerankingEvaluator: Evaluating the model on " + self.name + " dataset" + out_txt)
all_mrr_scores = []
all_ndcg_scores = []
num_queries = 0
num_positives = []
num_negatives = []
for instance in self.samples:
query = instance["query"]
positive = list(instance["positive"])
negative = list(instance["negative"])
docs = positive + negative
is_relevant = [1] * len(positive) + [0] * len(negative)
if len(positive) == 0 or len(negative) == 0:
continue
num_queries += 1
num_positives.append(len(positive))
num_negatives.append(len(negative))
model_input = [[query, doc] for doc in docs]
pred_scores = model.predict(model_input, convert_to_numpy=True, show_progress_bar=False)
pred_scores_argsort = np.argsort(-pred_scores) # Sort in decreasing order
mrr_score = 0
for rank, index in enumerate(pred_scores_argsort[0 : self.at_k]):
if is_relevant[index]:
mrr_score = 1 / (rank + 1)
break
all_mrr_scores.append(mrr_score)
all_ndcg_scores.append(ndcg_score([is_relevant], [pred_scores], k=self.at_k))
mean_mrr = np.mean(all_mrr_scores)
mean_ndcg = np.mean(all_ndcg_scores)
logger.info(
"Queries: {} \t Positives: Min {:.1f}, Mean {:.1f}, Max {:.1f} \t Negatives: Min {:.1f}, Mean {:.1f}, Max {:.1f}".format(
num_queries,
np.min(num_positives),
np.mean(num_positives),
np.max(num_positives),
np.min(num_negatives),
np.mean(num_negatives),
np.max(num_negatives),
)
)
logger.info("MRR@{}: {:.2f}".format(self.at_k, mean_mrr * 100))
logger.info("NDCG@{}: {:.2f}".format(self.at_k, mean_ndcg * 100))
if output_path is not None and self.write_csv:
csv_path = os.path.join(output_path, self.csv_file)
output_file_exists = os.path.isfile(csv_path)
with open(csv_path, mode="a" if output_file_exists else "w", encoding="utf-8") as f:
writer = csv.writer(f)
if not output_file_exists:
writer.writerow(self.csv_headers)
writer.writerow([epoch, steps, mean_mrr, mean_ndcg])
return mean_mrr
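# Usage sketch (illustrative only): the sample format expected by CERerankingEvaluator.
# The checkpoint name, query and documents are assumptions chosen for illustration.
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CERerankingEvaluator

samples = [
    {
        "query": "how do I bake bread",
        "positive": ["Mix flour, water, salt and yeast, then bake at 220 degrees."],
        "negative": ["Paris is the capital of France.", "Python is a programming language."],
    },
]
evaluator = CERerankingEvaluator(samples, at_k=10, name="toy-rerank")
model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")  # assumed reranking checkpoint
print(evaluator(model))  # returns mean MRR@10; NDCG@10 is logged alongside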
import logging
import os
import csv
from typing import List
from ... import InputExample
import numpy as np
logger = logging.getLogger(__name__)
class CESoftmaxAccuracyEvaluator:
"""
This evaluator can be used with the CrossEncoder class.
It is designed for CrossEncoders with 2 or more outputs. It measures the
accuracy of the predicted class vs. the gold labels.
"""
def __init__(self, sentence_pairs: List[List[str]], labels: List[int], name: str = "", write_csv: bool = True):
self.sentence_pairs = sentence_pairs
self.labels = labels
self.name = name
self.csv_file = "CESoftmaxAccuracyEvaluator" + ("_" + name if name else "") + "_results.csv"
self.csv_headers = ["epoch", "steps", "Accuracy"]
self.write_csv = write_csv
@classmethod
def from_input_examples(cls, examples: List[InputExample], **kwargs):
sentence_pairs = []
labels = []
for example in examples:
sentence_pairs.append(example.texts)
labels.append(example.label)
return cls(sentence_pairs, labels, **kwargs)
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
if epoch != -1:
if steps == -1:
out_txt = " after epoch {}:".format(epoch)
else:
out_txt = " in epoch {} after {} steps:".format(epoch, steps)
else:
out_txt = ":"
logger.info("CESoftmaxAccuracyEvaluator: Evaluating the model on " + self.name + " dataset" + out_txt)
pred_scores = model.predict(self.sentence_pairs, convert_to_numpy=True, show_progress_bar=False)
pred_labels = np.argmax(pred_scores, axis=1)
assert len(pred_labels) == len(self.labels)
acc = np.sum(pred_labels == self.labels) / len(self.labels)
logger.info("Accuracy: {:.2f}".format(acc * 100))
if output_path is not None and self.write_csv:
csv_path = os.path.join(output_path, self.csv_file)
output_file_exists = os.path.isfile(csv_path)
with open(csv_path, mode="a" if output_file_exists else "w", encoding="utf-8") as f:
writer = csv.writer(f)
if not output_file_exists:
writer.writerow(self.csv_headers)
writer.writerow([epoch, steps, acc])
return acc
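# Usage sketch (illustrative only): CESoftmaxAccuracyEvaluator built via from_input_examples.
# The labels and the base checkpoint are assumptions chosen for illustration.
from sentence_transformers import InputExample
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CESoftmaxAccuracyEvaluator

dev_examples = [
    InputExample(texts=["A soccer game.", "Some men are playing a sport."], label=1),
    InputExample(texts=["A soccer game.", "Nobody is doing anything."], label=0),
]
evaluator = CESoftmaxAccuracyEvaluator.from_input_examples(dev_examples, name="toy-softmax")
model = CrossEncoder("distilroberta-base", num_labels=2)  # assumed base model with 2 output logits
print(evaluator(model))  # accuracy of argmax(predictions) vs. the gold labels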
from .CEBinaryAccuracyEvaluator import CEBinaryAccuracyEvaluator
from .CEBinaryClassificationEvaluator import CEBinaryClassificationEvaluator
from .CEF1Evaluator import CEF1Evaluator
from .CECorrelationEvaluator import CECorrelationEvaluator
from .CESoftmaxAccuracyEvaluator import CESoftmaxAccuracyEvaluator
from .CERerankingEvaluator import CERerankingEvaluator
__all__ = [
"CEBinaryAccuracyEvaluator",
"CEBinaryClassificationEvaluator",
"CECorrelationEvaluator",
"CEF1Evaluator",
"CESoftmaxAccuracyEvaluator",
"CERerankingEvaluator",
]
from torch.utils.data import Dataset
from typing import List
from ..readers.InputExample import InputExample
import numpy as np
from transformers.utils.import_utils import is_nltk_available, NLTK_IMPORT_ERROR
class DenoisingAutoEncoderDataset(Dataset):
"""
The DenoisingAutoEncoderDataset returns InputExamples in the format: texts=[noise_fn(sentence), sentence]
It is used in combination with the DenoisingAutoEncoderLoss: Here, a decoder tries to re-construct the
sentence without noise.
:param sentences: A list of sentences
:param noise_fn: A noise function: Given a string, it returns a string with noise, e.g. deleted words
"""
def __init__(self, sentences: List[str], noise_fn=lambda s: DenoisingAutoEncoderDataset.delete(s)):
if not is_nltk_available():
raise ImportError(NLTK_IMPORT_ERROR.format(self.__class__.__name__))
self.sentences = sentences
self.noise_fn = noise_fn
def __getitem__(self, item):
sent = self.sentences[item]
return InputExample(texts=[self.noise_fn(sent), sent])
def __len__(self):
return len(self.sentences)
# Deletion noise.
@staticmethod
def delete(text, del_ratio=0.6):
from nltk import word_tokenize, TreebankWordDetokenizer
words = word_tokenize(text)
n = len(words)
if n == 0:
return text
keep_or_not = np.random.rand(n) > del_ratio
if sum(keep_or_not) == 0:
keep_or_not[np.random.choice(n)] = True # guarantee that at least one word remains
words_processed = TreebankWordDetokenizer().detokenize(np.array(words)[keep_or_not])
return words_processed
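# Usage sketch (illustrative only): wrapping raw sentences for denoising (TSDAE-style) training.
# Requires nltk with its tokenizer data installed; the sentences are assumptions for illustration.
from sentence_transformers.datasets import DenoisingAutoEncoderDataset

sentences = [
    "The quick brown fox jumps over the lazy dog.",
    "Sentence embeddings map text to dense vectors.",
]
dataset = DenoisingAutoEncoderDataset(sentences)
example = dataset[0]
print(example.texts)  # [noisy sentence (words randomly deleted), original sentence]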
import random
import math
class NoDuplicatesDataLoader:
def __init__(self, train_examples, batch_size):
"""
A special data loader to be used with MultipleNegativesRankingLoss.
The data loader ensures that there are no duplicate sentences within the same batch
"""
self.batch_size = batch_size
self.data_pointer = 0
self.collate_fn = None
self.train_examples = train_examples
random.shuffle(self.train_examples)
def __iter__(self):
for _ in range(self.__len__()):
batch = []
texts_in_batch = set()
while len(batch) < self.batch_size:
example = self.train_examples[self.data_pointer]
valid_example = True
for text in example.texts:
if text.strip().lower() in texts_in_batch:
valid_example = False
break
if valid_example:
batch.append(example)
for text in example.texts:
texts_in_batch.add(text.strip().lower())
self.data_pointer += 1
if self.data_pointer >= len(self.train_examples):
self.data_pointer = 0
random.shuffle(self.train_examples)
yield self.collate_fn(batch) if self.collate_fn is not None else batch
def __len__(self):
return math.floor(len(self.train_examples) / self.batch_size)
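# Usage sketch (illustrative only): NoDuplicatesDataLoader with MultipleNegativesRankingLoss-style
# pairs. The texts are assumptions chosen for illustration.
from sentence_transformers import InputExample
from sentence_transformers.datasets import NoDuplicatesDataLoader

train_examples = [
    InputExample(texts=["anchor one", "positive one"]),
    InputExample(texts=["anchor two", "positive two"]),
    InputExample(texts=["anchor one", "another positive"]),  # shares a text with the first example
]
loader = NoDuplicatesDataLoader(train_examples, batch_size=2)
for batch in loader:
    # no two examples inside a batch share a (stripped, lower-cased) text
    print([example.texts for example in batch])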
from torch.utils.data import Dataset
import logging
import gzip
from .. import SentenceTransformer
from ..readers import InputExample
from typing import List
import random
logger = logging.getLogger(__name__)
class ParallelSentencesDataset(Dataset):
"""
This dataset reader can be used to read in parallel sentences, i.e., it reads a file with tab-separated lines that contain the same
sentence in different languages. For example, the file can look like this (EN\tDE\tES):
hello world hallo welt hola mundo
second sentence zweiter satz segunda oración
The sentence in the first column will be mapped to a sentence embedding using the given embedder, for example,
a mono-lingual sentence embedding model for English. The sentences in the other languages will also be
mapped to this English sentence embedding.
When getting a sample from the dataset, we get one sentence with the corresponding sentence embedding for this sentence.
teacher_model can be any class that implements an encode function. The encode function gets a list of sentences and
returns a list of sentence embeddings.
"""
def __init__(
self,
student_model: SentenceTransformer,
teacher_model: SentenceTransformer,
batch_size: int = 8,
use_embedding_cache: bool = True,
):
"""
Parallel sentences dataset reader to train student model given a teacher model
:param student_model: Student sentence embedding model that should be trained
:param teacher_model: Teacher model, that provides the sentence embeddings for the first column in the dataset file
"""
self.student_model = student_model
self.teacher_model = teacher_model
self.datasets = []
self.datasets_iterator = []
self.datasets_tokenized = []
self.dataset_indices = []
self.copy_dataset_indices = []
self.cache = []
self.batch_size = batch_size
self.use_embedding_cache = use_embedding_cache
self.embedding_cache = {}
self.num_sentences = 0
def load_data(self, filepath: str, weight: int = 100, max_sentences: int = None, max_sentence_length: int = 128):
"""
Reads in a tab-separated .txt/.csv/.tsv or .gz file. The different columns contain the different translations of the sentence in the first column
:param filepath: Filepath to the file
:param weight: If more than one dataset is loaded with load_data: With which frequency should data be sampled from this dataset?
:param max_sentences: Max number of lines to be read from filepath
:param max_sentence_length: Skip the example if one of the sentences has more characters than max_sentence_length
:return:
"""
logger.info("Load " + filepath)
parallel_sentences = []
with gzip.open(filepath, "rt", encoding="utf8") if filepath.endswith(".gz") else open(
filepath, encoding="utf8"
) as fIn:
count = 0
for line in fIn:
sentences = line.strip().split("\t")
if (
max_sentence_length is not None
and max_sentence_length > 0
and max([len(sent) for sent in sentences]) > max_sentence_length
):
continue
parallel_sentences.append(sentences)
count += 1
if max_sentences is not None and max_sentences > 0 and count >= max_sentences:
break
self.add_dataset(
parallel_sentences, weight=weight, max_sentences=max_sentences, max_sentence_length=max_sentence_length
)
def add_dataset(
self,
parallel_sentences: List[List[str]],
weight: int = 100,
max_sentences: int = None,
max_sentence_length: int = 128,
):
sentences_map = {}
for sentences in parallel_sentences:
if (
max_sentence_length is not None
and max_sentence_length > 0
and max([len(sent) for sent in sentences]) > max_sentence_length
):
continue
source_sentence = sentences[0]
if source_sentence not in sentences_map:
sentences_map[source_sentence] = set()
for sent in sentences:
sentences_map[source_sentence].add(sent)
if max_sentences is not None and max_sentences > 0 and len(sentences_map) >= max_sentences:
break
if len(sentences_map) == 0:
return
self.num_sentences += sum([len(sentences_map[sent]) for sent in sentences_map])
dataset_id = len(self.datasets)
self.datasets.append(list(sentences_map.items()))
self.datasets_iterator.append(0)
self.dataset_indices.extend([dataset_id] * weight)
def generate_data(self):
source_sentences_list = []
target_sentences_list = []
for data_idx in self.dataset_indices:
src_sentence, trg_sentences = self.next_entry(data_idx)
source_sentences_list.append(src_sentence)
target_sentences_list.append(trg_sentences)
# Generate embeddings
src_embeddings = self.get_embeddings(source_sentences_list)
for src_embedding, trg_sentences in zip(src_embeddings, target_sentences_list):
for trg_sentence in trg_sentences:
self.cache.append(InputExample(texts=[trg_sentence], label=src_embedding))
random.shuffle(self.cache)
def next_entry(self, data_idx):
source, target_sentences = self.datasets[data_idx][self.datasets_iterator[data_idx]]
self.datasets_iterator[data_idx] += 1
if self.datasets_iterator[data_idx] >= len(self.datasets[data_idx]): # Restart iterator
self.datasets_iterator[data_idx] = 0
random.shuffle(self.datasets[data_idx])
return source, target_sentences
def get_embeddings(self, sentences):
if not self.use_embedding_cache:
return self.teacher_model.encode(
sentences, batch_size=self.batch_size, show_progress_bar=False, convert_to_numpy=True
)
# Use caching
new_sentences = []
for sent in sentences:
if sent not in self.embedding_cache:
new_sentences.append(sent)
if len(new_sentences) > 0:
new_embeddings = self.teacher_model.encode(
new_sentences, batch_size=self.batch_size, show_progress_bar=False, convert_to_numpy=True
)
for sent, embedding in zip(new_sentences, new_embeddings):
self.embedding_cache[sent] = embedding
return [self.embedding_cache[sent] for sent in sentences]
def __len__(self):
return self.num_sentences
def __getitem__(self, idx):
if len(self.cache) == 0:
self.generate_data()
return self.cache.pop()
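# Usage sketch (illustrative only): multilingual knowledge-distillation data loading.
# The model names and the file path are assumptions; the file is expected to contain
# tab-separated parallel sentences.
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer
from sentence_transformers.datasets import ParallelSentencesDataset

teacher = SentenceTransformer("paraphrase-distilroberta-base-v2")  # assumed teacher checkpoint
student = SentenceTransformer("xlm-roberta-base")                  # assumed student base model
data = ParallelSentencesDataset(student_model=student, teacher_model=teacher, batch_size=32)
data.load_data("parallel-sentences/en-de.tsv.gz")  # hypothetical path to a .tsv/.tsv.gz file
loader = DataLoader(data, shuffle=True, batch_size=32)
# during training, model.fit(...) replaces loader.collate_fn with the model's smart batching collate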
""" """
from torch.utils.data import IterableDataset
import numpy as np
from typing import List
from ..readers import InputExample
import logging
logger = logging.getLogger(__name__)
class SentenceLabelDataset(IterableDataset):
"""
This dataset can be used for some specific Triplet Losses like BATCH_HARD_TRIPLET_LOSS which requires
multiple examples with the same label in a batch.
It draws n consecutive, random and unique samples from one label at a time. This is repeated for each label.
Labels with fewer than n unique samples are ignored.
This also applies to drawing without replacement: once fewer than n samples remain for a label, it is skipped.
This *DOES NOT* check whether there are more labels than the batch size, or whether the batch size
is divisible by the number of samples drawn per label.
"""
def __init__(self, examples: List[InputExample], samples_per_label: int = 2, with_replacement: bool = False):
"""
Creates a LabelSampler for a SentenceLabelDataset.
:param examples:
a list with InputExamples
:param samples_per_label:
the number of consecutive, random and unique samples drawn per label. Batch size should be a multiple of samples_per_label
:param with_replacement:
if this is False, then each sample is drawn at most once (depending on the total number of samples per label).
if this is True, then one sample can be drawn in multiple draws, but still not multiple times within the same
drawing.
"""
super().__init__()
self.samples_per_label = samples_per_label
# Group examples by label
label2ex = {}
for example in examples:
if example.label not in label2ex:
label2ex[example.label] = []
label2ex[example.label].append(example)
# Include only labels with at least 2 examples
self.grouped_inputs = []
self.groups_right_border = []
num_labels = 0
for label, label_examples in label2ex.items():
if len(label_examples) >= self.samples_per_label:
self.grouped_inputs.extend(label_examples)
self.groups_right_border.append(
len(self.grouped_inputs)
) # At which position does this label group / bucket end?
num_labels += 1
self.label_range = np.arange(num_labels)
self.with_replacement = with_replacement
np.random.shuffle(self.label_range)
logger.info(
"SentenceLabelDataset: {} examples, from which {} examples could be used (those labels appeared at least {} times). {} different labels found.".format(
len(examples), len(self.grouped_inputs), self.samples_per_label, num_labels
)
)
def __iter__(self):
label_idx = 0
count = 0
already_seen = {}
while count < len(self.grouped_inputs):
label = self.label_range[label_idx]
if label not in already_seen:
already_seen[label] = set()
left_border = 0 if label == 0 else self.groups_right_border[label - 1]
right_border = self.groups_right_border[label]
if self.with_replacement:
selection = np.arange(left_border, right_border)
else:
selection = [i for i in np.arange(left_border, right_border) if i not in already_seen[label]]
if len(selection) >= self.samples_per_label:
for element_idx in np.random.choice(selection, self.samples_per_label, replace=False):
count += 1
already_seen[label].add(element_idx)
yield self.grouped_inputs[element_idx]
label_idx += 1
if label_idx >= len(self.label_range):
label_idx = 0
already_seen = {}
np.random.shuffle(self.label_range)
def __len__(self):
return len(self.grouped_inputs)
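# Usage sketch (illustrative only): grouping labeled InputExamples for batch-hard triplet style
# training. The texts and labels are assumptions chosen for illustration.
from torch.utils.data import DataLoader
from sentence_transformers import InputExample
from sentence_transformers.datasets import SentenceLabelDataset

examples = [
    InputExample(texts=["first sentence about cats"], label=0),
    InputExample(texts=["second sentence about cats"], label=0),
    InputExample(texts=["first sentence about dogs"], label=1),
    InputExample(texts=["second sentence about dogs"], label=1),
]
dataset = SentenceLabelDataset(examples, samples_per_label=2)
loader = DataLoader(dataset, batch_size=4)  # batch size should be a multiple of samples_per_label
# during training, model.fit(...) replaces loader.collate_fn with the model's smart batching collate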
from torch.utils.data import Dataset
from typing import List
from .. import SentenceTransformer
from ..readers.InputExample import InputExample
class SentencesDataset(Dataset):
"""
DEPRECATED: This class is no longer used. Instead of wrapping your List of InputExamples in a SentencesDataset
and then passing it to the DataLoader, you can pass the list of InputExamples directly to the DataLoader.
"""
def __init__(self, examples: List[InputExample], model: SentenceTransformer):
self.examples = examples
def __getitem__(self, item):
return self.examples[item]
def __len__(self):
return len(self.examples)
from .DenoisingAutoEncoderDataset import DenoisingAutoEncoderDataset
from .NoDuplicatesDataLoader import NoDuplicatesDataLoader
from .ParallelSentencesDataset import ParallelSentencesDataset
from .SentencesDataset import SentencesDataset
from .SentenceLabelDataset import SentenceLabelDataset
__all__ = [
"DenoisingAutoEncoderDataset",
"NoDuplicatesDataLoader",
"ParallelSentencesDataset",
"SentencesDataset",
"SentenceLabelDataset",
]
from . import SentenceEvaluator
import logging
import os
import csv
from sklearn.metrics.pairwise import paired_cosine_distances, paired_euclidean_distances, paired_manhattan_distances
from sklearn.metrics import average_precision_score
import numpy as np
from typing import List
from ..readers import InputExample
logger = logging.getLogger(__name__)
class BinaryClassificationEvaluator(SentenceEvaluator):
"""
Evaluate a model based on the similarity of the embeddings by calculating the accuracy of identifying similar and
dissimilar sentences.
The metrics are cosine similarity, dot product, and the Euclidean and Manhattan distances.
The returned score is the highest Average Precision (AP) across these metrics.
The results are written in a CSV. If a CSV already exists, then values are appended.
The labels need to be 0 for dissimilar pairs and 1 for similar pairs.
:param sentences1: The first column of sentences
:param sentences2: The second column of sentences
:param labels: labels[i] is the label for the pair (sentences1[i], sentences2[i]). Must be 0 or 1
:param name: Name for the output
:param batch_size: Batch size used to compute embeddings
:param show_progress_bar: If true, prints a progress bar
:param write_csv: Write results to a CSV file
"""
def __init__(
self,
sentences1: List[str],
sentences2: List[str],
labels: List[int],
name: str = "",
batch_size: int = 32,
show_progress_bar: bool = False,
write_csv: bool = True,
):
self.sentences1 = sentences1
self.sentences2 = sentences2
self.labels = labels
assert len(self.sentences1) == len(self.sentences2)
assert len(self.sentences1) == len(self.labels)
for label in labels:
assert label == 0 or label == 1
self.write_csv = write_csv
self.name = name
self.batch_size = batch_size
if show_progress_bar is None:
show_progress_bar = (
logger.getEffectiveLevel() == logging.INFO or logger.getEffectiveLevel() == logging.DEBUG
)
self.show_progress_bar = show_progress_bar
self.csv_file = "binary_classification_evaluation" + ("_" + name if name else "") + "_results.csv"
self.csv_headers = [
"epoch",
"steps",
"cossim_accuracy",
"cossim_accuracy_threshold",
"cossim_f1",
"cossim_precision",
"cossim_recall",
"cossim_f1_threshold",
"cossim_ap",
"manhattan_accuracy",
"manhattan_accuracy_threshold",
"manhattan_f1",
"manhattan_precision",
"manhattan_recall",
"manhattan_f1_threshold",
"manhattan_ap",
"euclidean_accuracy",
"euclidean_accuracy_threshold",
"euclidean_f1",
"euclidean_precision",
"euclidean_recall",
"euclidean_f1_threshold",
"euclidean_ap",
"dot_accuracy",
"dot_accuracy_threshold",
"dot_f1",
"dot_precision",
"dot_recall",
"dot_f1_threshold",
"dot_ap",
]
@classmethod
def from_input_examples(cls, examples: List[InputExample], **kwargs):
sentences1 = []
sentences2 = []
scores = []
for example in examples:
sentences1.append(example.texts[0])
sentences2.append(example.texts[1])
scores.append(example.label)
return cls(sentences1, sentences2, scores, **kwargs)
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
if epoch != -1:
if steps == -1:
out_txt = f" after epoch {epoch}:"
else:
out_txt = f" in epoch {epoch} after {steps} steps:"
else:
out_txt = ":"
logger.info("Binary Accuracy Evaluation of the model on " + self.name + " dataset" + out_txt)
scores = self.compute_metrices(model)
# Main score is the max of Average Precision (AP)
main_score = max(scores[short_name]["ap"] for short_name in scores)
file_output_data = [epoch, steps]
for header_name in self.csv_headers:
if "_" in header_name:
sim_fct, metric = header_name.split("_", maxsplit=1)
file_output_data.append(scores[sim_fct][metric])
if output_path is not None and self.write_csv:
csv_path = os.path.join(output_path, self.csv_file)
if not os.path.isfile(csv_path):
with open(csv_path, newline="", mode="w", encoding="utf-8") as f:
writer = csv.writer(f)
writer.writerow(self.csv_headers)
writer.writerow(file_output_data)
else:
with open(csv_path, newline="", mode="a", encoding="utf-8") as f:
writer = csv.writer(f)
writer.writerow(file_output_data)
return main_score
def compute_metrices(self, model):
try:
# If the sentences are hashable, then we can use a set to avoid embedding the same sentences multiple times
sentences = list(set(self.sentences1 + self.sentences2))
embeddings = model.encode(
sentences, batch_size=self.batch_size, show_progress_bar=self.show_progress_bar, convert_to_numpy=True
)
emb_dict = {sent: emb for sent, emb in zip(sentences, embeddings)}
embeddings1 = [emb_dict[sent] for sent in self.sentences1]
embeddings2 = [emb_dict[sent] for sent in self.sentences2]
except TypeError:
# Otherwise we just embed everything, e.g. if the sentences are images for evaluating a CLIP model
embeddings = model.encode(
self.sentences1 + self.sentences2,
batch_size=self.batch_size,
show_progress_bar=self.show_progress_bar,
convert_to_numpy=True,
)
embeddings1 = embeddings[: len(self.sentences1)]
embeddings2 = embeddings[len(self.sentences1) :]
cosine_scores = 1 - paired_cosine_distances(embeddings1, embeddings2)
manhattan_distances = paired_manhattan_distances(embeddings1, embeddings2)
euclidean_distances = paired_euclidean_distances(embeddings1, embeddings2)
embeddings1_np = np.asarray(embeddings1)
embeddings2_np = np.asarray(embeddings2)
dot_scores = [np.dot(embeddings1_np[i], embeddings2_np[i]) for i in range(len(embeddings1_np))]
labels = np.asarray(self.labels)
output_scores = {}
for short_name, name, scores, reverse in [
["cossim", "Cosine-Similarity", cosine_scores, True],
["manhattan", "Manhattan-Distance", manhattan_distances, False],
["euclidean", "Euclidean-Distance", euclidean_distances, False],
["dot", "Dot-Product", dot_scores, True],
]:
acc, acc_threshold = self.find_best_acc_and_threshold(scores, labels, reverse)
f1, precision, recall, f1_threshold = self.find_best_f1_and_threshold(scores, labels, reverse)
ap = average_precision_score(labels, scores * (1 if reverse else -1))
logger.info(
"Accuracy with {}: {:.2f}\t(Threshold: {:.4f})".format(name, acc * 100, acc_threshold)
)
logger.info("F1 with {}: {:.2f}\t(Threshold: {:.4f})".format(name, f1 * 100, f1_threshold))
logger.info("Precision with {}: {:.2f}".format(name, precision * 100))
logger.info("Recall with {}: {:.2f}".format(name, recall * 100))
logger.info("Average Precision with {}: {:.2f}\n".format(name, ap * 100))
output_scores[short_name] = {
"accuracy": acc,
"accuracy_threshold": acc_threshold,
"f1": f1,
"f1_threshold": f1_threshold,
"precision": precision,
"recall": recall,
"ap": ap,
}
return output_scores
@staticmethod
def find_best_acc_and_threshold(scores, labels, high_score_more_similar: bool):
assert len(scores) == len(labels)
rows = list(zip(scores, labels))
rows = sorted(rows, key=lambda x: x[0], reverse=high_score_more_similar)
max_acc = 0
best_threshold = -1
positive_so_far = 0
remaining_negatives = sum(labels == 0)
for i in range(len(rows) - 1):
score, label = rows[i]
if label == 1:
positive_so_far += 1
else:
remaining_negatives -= 1
acc = (positive_so_far + remaining_negatives) / len(labels)
if acc > max_acc:
max_acc = acc
best_threshold = (rows[i][0] + rows[i + 1][0]) / 2
return max_acc, best_threshold
@staticmethod
def find_best_f1_and_threshold(scores, labels, high_score_more_similar: bool):
assert len(scores) == len(labels)
scores = np.asarray(scores)
labels = np.asarray(labels)
rows = list(zip(scores, labels))
rows = sorted(rows, key=lambda x: x[0], reverse=high_score_more_similar)
best_f1 = best_precision = best_recall = 0
threshold = 0
nextract = 0
ncorrect = 0
total_num_duplicates = sum(labels)
for i in range(len(rows) - 1):
score, label = rows[i]
nextract += 1
if label == 1:
ncorrect += 1
if ncorrect > 0:
precision = ncorrect / nextract
recall = ncorrect / total_num_duplicates
f1 = 2 * precision * recall / (precision + recall)
if f1 > best_f1:
best_f1 = f1
best_precision = precision
best_recall = recall
threshold = (rows[i][0] + rows[i + 1][0]) / 2
return best_f1, best_precision, best_recall, threshold
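# Usage sketch (illustrative only): duplicate-pair evaluation with BinaryClassificationEvaluator.
# The checkpoint name and the toy pairs are assumptions chosen for illustration.
from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import BinaryClassificationEvaluator

model = SentenceTransformer("all-MiniLM-L6-v2")  # assumed checkpoint
sentences1 = ["How do I learn Python?", "What is the capital of France?"]
sentences2 = ["What is the best way to learn Python?", "Who wrote Hamlet?"]
labels = [1, 0]  # 1 = similar / duplicate pair, 0 = dissimilar pair
evaluator = BinaryClassificationEvaluator(sentences1, sentences2, labels, name="toy-pairs")
print(evaluator(model))  # highest Average Precision across the similarity/distance metrics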
from contextlib import nullcontext
from . import SentenceEvaluator, SimilarityFunction
import logging
import os
import csv
from sklearn.metrics.pairwise import paired_cosine_distances, paired_euclidean_distances, paired_manhattan_distances
from scipy.stats import pearsonr, spearmanr
import numpy as np
from typing import List, Literal, Optional
from ..readers import InputExample
logger = logging.getLogger(__name__)
class EmbeddingSimilarityEvaluator(SentenceEvaluator):
"""
Evaluate a model based on the similarity of the embeddings by calculating the Pearson and Spearman correlations
in comparison to the gold standard labels.
The metrics are cosine similarity, dot product, and the Euclidean and Manhattan distances.
The returned score is the Spearman correlation with a specified metric.
The results are written in a CSV. If a CSV already exists, then values are appended.
"""
def __init__(
self,
sentences1: List[str],
sentences2: List[str],
scores: List[float],
batch_size: int = 16,
main_similarity: SimilarityFunction = None,
name: str = "",
show_progress_bar: bool = False,
write_csv: bool = True,
precision: Optional[Literal["float32", "int8", "uint8", "binary", "ubinary"]] = None,
truncate_dim: Optional[int] = None,
):
"""
Constructs an evaluator for the dataset.
The labels need to indicate the similarity between the sentences.
:param sentences1: List with the first sentence in a pair
:param sentences2: List with the second sentence in a pair
:param scores: Similarity score between sentences1[i] and sentences2[i]
:param write_csv: Write results to a CSV file
:param precision: The precision to use for the embeddings. Can be "float32", "int8", "uint8", "binary", or
"ubinary". Defaults to None.
:param truncate_dim: The dimension to truncate sentence embeddings to. `None` uses the model's current
truncation dimension. Defaults to None.
"""
self.sentences1 = sentences1
self.sentences2 = sentences2
self.scores = scores
self.write_csv = write_csv
self.precision = precision
self.truncate_dim = truncate_dim
assert len(self.sentences1) == len(self.sentences2)
assert len(self.sentences1) == len(self.scores)
self.main_similarity = main_similarity
self.name = name
self.batch_size = batch_size
if show_progress_bar is None:
show_progress_bar = (
logger.getEffectiveLevel() == logging.INFO or logger.getEffectiveLevel() == logging.DEBUG
)
self.show_progress_bar = show_progress_bar
self.csv_file = (
"similarity_evaluation"
+ ("_" + name if name else "")
+ ("_" + precision if precision else "")
+ "_results.csv"
)
self.csv_headers = [
"epoch",
"steps",
"cosine_pearson",
"cosine_spearman",
"euclidean_pearson",
"euclidean_spearman",
"manhattan_pearson",
"manhattan_spearman",
"dot_pearson",
"dot_spearman",
]
@classmethod
def from_input_examples(cls, examples: List[InputExample], **kwargs):
sentences1 = []
sentences2 = []
scores = []
for example in examples:
sentences1.append(example.texts[0])
sentences2.append(example.texts[1])
scores.append(example.label)
return cls(sentences1, sentences2, scores, **kwargs)
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
if epoch != -1:
if steps == -1:
out_txt = " after epoch {}:".format(epoch)
else:
out_txt = " in epoch {} after {} steps:".format(epoch, steps)
else:
out_txt = ":"
logger.info("EmbeddingSimilarityEvaluator: Evaluating the model on " + self.name + " dataset" + out_txt)
with nullcontext() if self.truncate_dim is None else model.truncate_sentence_embeddings(self.truncate_dim):
embeddings1 = model.encode(
self.sentences1,
batch_size=self.batch_size,
show_progress_bar=self.show_progress_bar,
convert_to_numpy=True,
precision=self.precision,
normalize_embeddings=bool(self.precision),
)
embeddings2 = model.encode(
self.sentences2,
batch_size=self.batch_size,
show_progress_bar=self.show_progress_bar,
convert_to_numpy=True,
precision=self.precision,
normalize_embeddings=bool(self.precision),
)
# Binary and ubinary embeddings are packed, so we need to unpack them for the distance metrics
if self.precision == "binary":
embeddings1 = (embeddings1 + 128).astype(np.uint8)
embeddings2 = (embeddings2 + 128).astype(np.uint8)
if self.precision in ("ubinary", "binary"):
embeddings1 = np.unpackbits(embeddings1, axis=1)
embeddings2 = np.unpackbits(embeddings2, axis=1)
labels = self.scores
cosine_scores = 1 - (paired_cosine_distances(embeddings1, embeddings2))
manhattan_distances = -paired_manhattan_distances(embeddings1, embeddings2)
euclidean_distances = -paired_euclidean_distances(embeddings1, embeddings2)
dot_products = [np.dot(emb1, emb2) for emb1, emb2 in zip(embeddings1, embeddings2)]
eval_pearson_cosine, _ = pearsonr(labels, cosine_scores)
eval_spearman_cosine, _ = spearmanr(labels, cosine_scores)
eval_pearson_manhattan, _ = pearsonr(labels, manhattan_distances)
eval_spearman_manhattan, _ = spearmanr(labels, manhattan_distances)
eval_pearson_euclidean, _ = pearsonr(labels, euclidean_distances)
eval_spearman_euclidean, _ = spearmanr(labels, euclidean_distances)
eval_pearson_dot, _ = pearsonr(labels, dot_products)
eval_spearman_dot, _ = spearmanr(labels, dot_products)
logger.info(
"Cosine-Similarity :\tPearson: {:.4f}\tSpearman: {:.4f}".format(eval_pearson_cosine, eval_spearman_cosine)
)
logger.info(
"Manhattan-Distance:\tPearson: {:.4f}\tSpearman: {:.4f}".format(
eval_pearson_manhattan, eval_spearman_manhattan
)
)
logger.info(
"Euclidean-Distance:\tPearson: {:.4f}\tSpearman: {:.4f}".format(
eval_pearson_euclidean, eval_spearman_euclidean
)
)
logger.info(
"Dot-Product-Similarity:\tPearson: {:.4f}\tSpearman: {:.4f}".format(eval_pearson_dot, eval_spearman_dot)
)
if output_path is not None and self.write_csv:
csv_path = os.path.join(output_path, self.csv_file)
output_file_exists = os.path.isfile(csv_path)
with open(csv_path, newline="", mode="a" if output_file_exists else "w", encoding="utf-8") as f:
writer = csv.writer(f)
if not output_file_exists:
writer.writerow(self.csv_headers)
writer.writerow(
[
epoch,
steps,
eval_pearson_cosine,
eval_spearman_cosine,
eval_pearson_euclidean,
eval_spearman_euclidean,
eval_pearson_manhattan,
eval_spearman_manhattan,
eval_pearson_dot,
eval_spearman_dot,
]
)
if self.main_similarity == SimilarityFunction.COSINE:
return eval_spearman_cosine
elif self.main_similarity == SimilarityFunction.EUCLIDEAN:
return eval_spearman_euclidean
elif self.main_similarity == SimilarityFunction.MANHATTAN:
return eval_spearman_manhattan
elif self.main_similarity == SimilarityFunction.DOT_PRODUCT:
return eval_spearman_dot
elif self.main_similarity is None:
return max(eval_spearman_cosine, eval_spearman_manhattan, eval_spearman_euclidean, eval_spearman_dot)
else:
raise ValueError("Unknown main_similarity value")
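# Usage sketch (illustrative only): STS-style evaluation where the gold scores are similarities
# in [0, 1]. The checkpoint name and the example pairs are assumptions chosen for illustration.
from sentence_transformers import SentenceTransformer, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

dev_examples = [
    InputExample(texts=["A man is playing guitar.", "A person plays a guitar."], label=0.9),
    InputExample(texts=["A man is playing guitar.", "A child is reading a book."], label=0.2),
    InputExample(texts=["A man is playing guitar.", "A dog runs in the park."], label=0.1),
]
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_examples, name="toy-sts")
model = SentenceTransformer("all-MiniLM-L6-v2")  # assumed checkpoint
print(evaluator(model))  # best Spearman correlation when main_similarity is not set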
from . import SentenceEvaluator
import torch
from torch import Tensor
import logging
from tqdm import trange
from ..util import cos_sim, dot_score
import os
import numpy as np
from typing import List, Dict, Set, Callable
import heapq
logger = logging.getLogger(__name__)
class InformationRetrievalEvaluator(SentenceEvaluator):
"""
This class evaluates an Information Retrieval (IR) setting.
Given a set of queries and a large corpus, it will retrieve for each query the top-k most similar documents. It measures
Mean Reciprocal Rank (MRR), Recall@k, and Normalized Discounted Cumulative Gain (NDCG).
"""
def __init__(
self,
queries: Dict[str, str], # qid => query
corpus: Dict[str, str], # cid => doc
relevant_docs: Dict[str, Set[str]], # qid => Set[cid]
corpus_chunk_size: int = 50000,
mrr_at_k: List[int] = [10],
ndcg_at_k: List[int] = [10],
accuracy_at_k: List[int] = [1, 3, 5, 10],
precision_recall_at_k: List[int] = [1, 3, 5, 10],
map_at_k: List[int] = [100],
show_progress_bar: bool = False,
batch_size: int = 32,
name: str = "",
write_csv: bool = True,
score_functions: Dict[str, Callable[[Tensor, Tensor], Tensor]] = {
"cos_sim": cos_sim,
"dot_score": dot_score,
}, # Score function, higher=more similar
main_score_function: str = None,
):
self.queries_ids = []
for qid in queries:
if qid in relevant_docs and len(relevant_docs[qid]) > 0:
self.queries_ids.append(qid)
self.queries = [queries[qid] for qid in self.queries_ids]
self.corpus_ids = list(corpus.keys())
self.corpus = [corpus[cid] for cid in self.corpus_ids]
self.relevant_docs = relevant_docs
self.corpus_chunk_size = corpus_chunk_size
self.mrr_at_k = mrr_at_k
self.ndcg_at_k = ndcg_at_k
self.accuracy_at_k = accuracy_at_k
self.precision_recall_at_k = precision_recall_at_k
self.map_at_k = map_at_k
self.show_progress_bar = show_progress_bar
self.batch_size = batch_size
self.name = name
self.write_csv = write_csv
self.score_functions = score_functions
self.score_function_names = sorted(list(self.score_functions.keys()))
self.main_score_function = main_score_function
if name:
name = "_" + name
self.csv_file: str = "Information-Retrieval_evaluation" + name + "_results.csv"
self.csv_headers = ["epoch", "steps"]
for score_name in self.score_function_names:
for k in accuracy_at_k:
self.csv_headers.append("{}-Accuracy@{}".format(score_name, k))
for k in precision_recall_at_k:
self.csv_headers.append("{}-Precision@{}".format(score_name, k))
self.csv_headers.append("{}-Recall@{}".format(score_name, k))
for k in mrr_at_k:
self.csv_headers.append("{}-MRR@{}".format(score_name, k))
for k in ndcg_at_k:
self.csv_headers.append("{}-NDCG@{}".format(score_name, k))
for k in map_at_k:
self.csv_headers.append("{}-MAP@{}".format(score_name, k))
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1, *args, **kwargs) -> float:
if epoch != -1:
out_txt = (
" after epoch {}:".format(epoch)
if steps == -1
else " in epoch {} after {} steps:".format(epoch, steps)
)
else:
out_txt = ":"
logger.info("Information Retrieval Evaluation on " + self.name + " dataset" + out_txt)
scores = self.compute_metrices(model, *args, **kwargs)
# Write results to disc
if output_path is not None and self.write_csv:
csv_path = os.path.join(output_path, self.csv_file)
if not os.path.isfile(csv_path):
fOut = open(csv_path, mode="w", encoding="utf-8")
fOut.write(",".join(self.csv_headers))
fOut.write("\n")
else:
fOut = open(csv_path, mode="a", encoding="utf-8")
output_data = [epoch, steps]
for name in self.score_function_names:
for k in self.accuracy_at_k:
output_data.append(scores[name]["accuracy@k"][k])
for k in self.precision_recall_at_k:
output_data.append(scores[name]["precision@k"][k])
output_data.append(scores[name]["recall@k"][k])
for k in self.mrr_at_k:
output_data.append(scores[name]["mrr@k"][k])
for k in self.ndcg_at_k:
output_data.append(scores[name]["ndcg@k"][k])
for k in self.map_at_k:
output_data.append(scores[name]["map@k"][k])
fOut.write(",".join(map(str, output_data)))
fOut.write("\n")
fOut.close()
if self.main_score_function is None:
return max([scores[name]["map@k"][max(self.map_at_k)] for name in self.score_function_names])
else:
return scores[self.main_score_function]["map@k"][max(self.map_at_k)]
def compute_metrices(self, model, corpus_model=None, corpus_embeddings: Tensor = None) -> Dict[str, float]:
if corpus_model is None:
corpus_model = model
max_k = max(
max(self.mrr_at_k),
max(self.ndcg_at_k),
max(self.accuracy_at_k),
max(self.precision_recall_at_k),
max(self.map_at_k),
)
# Compute embedding for the queries
query_embeddings = model.encode(
self.queries, show_progress_bar=self.show_progress_bar, batch_size=self.batch_size, convert_to_tensor=True
)
queries_result_list = {}
for name in self.score_functions:
queries_result_list[name] = [[] for _ in range(len(query_embeddings))]
# Iterate over chunks of the corpus
for corpus_start_idx in trange(
0, len(self.corpus), self.corpus_chunk_size, desc="Corpus Chunks", disable=not self.show_progress_bar
):
corpus_end_idx = min(corpus_start_idx + self.corpus_chunk_size, len(self.corpus))
# Encode chunk of corpus
if corpus_embeddings is None:
sub_corpus_embeddings = corpus_model.encode(
self.corpus[corpus_start_idx:corpus_end_idx],
show_progress_bar=False,
batch_size=self.batch_size,
convert_to_tensor=True,
)
else:
sub_corpus_embeddings = corpus_embeddings[corpus_start_idx:corpus_end_idx]
# Compute similarities for each score function
for name, score_function in self.score_functions.items():
pair_scores = score_function(query_embeddings, sub_corpus_embeddings)
# Get top-k values
pair_scores_top_k_values, pair_scores_top_k_idx = torch.topk(
pair_scores, min(max_k, len(pair_scores[0])), dim=1, largest=True, sorted=False
)
pair_scores_top_k_values = pair_scores_top_k_values.cpu().tolist()
pair_scores_top_k_idx = pair_scores_top_k_idx.cpu().tolist()
for query_itr in range(len(query_embeddings)):
for sub_corpus_id, score in zip(
pair_scores_top_k_idx[query_itr], pair_scores_top_k_values[query_itr]
):
corpus_id = self.corpus_ids[corpus_start_idx + sub_corpus_id]
if len(queries_result_list[name][query_itr]) < max_k:
heapq.heappush(
queries_result_list[name][query_itr], (score, corpus_id)
) # heapq orders tuples by their first element, i.e. the score
else:
heapq.heappushpop(queries_result_list[name][query_itr], (score, corpus_id))
for name in queries_result_list:
for query_itr in range(len(queries_result_list[name])):
for doc_itr in range(len(queries_result_list[name][query_itr])):
score, corpus_id = queries_result_list[name][query_itr][doc_itr]
queries_result_list[name][query_itr][doc_itr] = {"corpus_id": corpus_id, "score": score}
logger.info("Queries: {}".format(len(self.queries)))
logger.info("Corpus: {}\n".format(len(self.corpus)))
# Compute scores
scores = {name: self.compute_metrics(queries_result_list[name]) for name in self.score_functions}
# Output
for name in self.score_function_names:
logger.info("Score-Function: {}".format(name))
self.output_scores(scores[name])
return scores
def compute_metrics(self, queries_result_list: List[object]):
# Init score computation values
num_hits_at_k = {k: 0 for k in self.accuracy_at_k}
precisions_at_k = {k: [] for k in self.precision_recall_at_k}
recall_at_k = {k: [] for k in self.precision_recall_at_k}
MRR = {k: 0 for k in self.mrr_at_k}
ndcg = {k: [] for k in self.ndcg_at_k}
AveP_at_k = {k: [] for k in self.map_at_k}
# Compute scores on results
for query_itr in range(len(queries_result_list)):
query_id = self.queries_ids[query_itr]
# Sort scores
top_hits = sorted(queries_result_list[query_itr], key=lambda x: x["score"], reverse=True)
query_relevant_docs = self.relevant_docs[query_id]
# Accuracy@k - We count the result as correct if at least one relevant doc is among the top-k documents
for k_val in self.accuracy_at_k:
for hit in top_hits[0:k_val]:
if hit["corpus_id"] in query_relevant_docs:
num_hits_at_k[k_val] += 1
break
# Precision and Recall@k
for k_val in self.precision_recall_at_k:
num_correct = 0
for hit in top_hits[0:k_val]:
if hit["corpus_id"] in query_relevant_docs:
num_correct += 1
precisions_at_k[k_val].append(num_correct / k_val)
recall_at_k[k_val].append(num_correct / len(query_relevant_docs))
# MRR@k
for k_val in self.mrr_at_k:
for rank, hit in enumerate(top_hits[0:k_val]):
if hit["corpus_id"] in query_relevant_docs:
MRR[k_val] += 1.0 / (rank + 1)
break
# NDCG@k
for k_val in self.ndcg_at_k:
predicted_relevance = [
1 if top_hit["corpus_id"] in query_relevant_docs else 0 for top_hit in top_hits[0:k_val]
]
true_relevances = [1] * len(query_relevant_docs)
ndcg_value = self.compute_dcg_at_k(predicted_relevance, k_val) / self.compute_dcg_at_k(
true_relevances, k_val
)
ndcg[k_val].append(ndcg_value)
# MAP@k
for k_val in self.map_at_k:
num_correct = 0
sum_precisions = 0
for rank, hit in enumerate(top_hits[0:k_val]):
if hit["corpus_id"] in query_relevant_docs:
num_correct += 1
sum_precisions += num_correct / (rank + 1)
avg_precision = sum_precisions / min(k_val, len(query_relevant_docs))
AveP_at_k[k_val].append(avg_precision)
# Compute averages
for k in num_hits_at_k:
num_hits_at_k[k] /= len(self.queries)
for k in precisions_at_k:
precisions_at_k[k] = np.mean(precisions_at_k[k])
for k in recall_at_k:
recall_at_k[k] = np.mean(recall_at_k[k])
for k in ndcg:
ndcg[k] = np.mean(ndcg[k])
for k in MRR:
MRR[k] /= len(self.queries)
for k in AveP_at_k:
AveP_at_k[k] = np.mean(AveP_at_k[k])
return {
"accuracy@k": num_hits_at_k,
"precision@k": precisions_at_k,
"recall@k": recall_at_k,
"ndcg@k": ndcg,
"mrr@k": MRR,
"map@k": AveP_at_k,
}
def output_scores(self, scores):
for k in scores["accuracy@k"]:
logger.info("Accuracy@{}: {:.2f}%".format(k, scores["accuracy@k"][k] * 100))
for k in scores["precision@k"]:
logger.info("Precision@{}: {:.2f}%".format(k, scores["precision@k"][k] * 100))
for k in scores["recall@k"]:
logger.info("Recall@{}: {:.2f}%".format(k, scores["recall@k"][k] * 100))
for k in scores["mrr@k"]:
logger.info("MRR@{}: {:.4f}".format(k, scores["mrr@k"][k]))
for k in scores["ndcg@k"]:
logger.info("NDCG@{}: {:.4f}".format(k, scores["ndcg@k"][k]))
for k in scores["map@k"]:
logger.info("MAP@{}: {:.4f}".format(k, scores["map@k"][k]))
@staticmethod
def compute_dcg_at_k(relevances, k):
dcg = 0
for i in range(min(len(relevances), k)):
dcg += relevances[i] / np.log2(i + 2) # +2 as we start our idx at 0
return dcg
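# Usage sketch (illustrative only): a miniature information-retrieval evaluation. The IDs, texts
# and checkpoint name are assumptions chosen for illustration.
from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import InformationRetrievalEvaluator

queries = {"q1": "What is the capital of France?"}
corpus = {
    "d1": "Paris is the capital and most populous city of France.",
    "d2": "Berlin is the capital of Germany.",
}
relevant_docs = {"q1": {"d1"}}
evaluator = InformationRetrievalEvaluator(queries, corpus, relevant_docs, name="toy-ir")
model = SentenceTransformer("all-MiniLM-L6-v2")  # assumed checkpoint
print(evaluator(model))  # MAP@100 of the best score function (cos_sim or dot_score) by default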
from . import SentenceEvaluator
import torch
from torch.utils.data import DataLoader
import logging
from ..util import batch_to_device
import os
import csv
logger = logging.getLogger(__name__)
class LabelAccuracyEvaluator(SentenceEvaluator):
"""
Evaluate a model based on its accuracy on a labeled dataset.
This requires a model trained with SoftmaxLoss; pass the loss object as `softmax_model`.
The results are written in a CSV. If a CSV already exists, then values are appended.
"""
def __init__(self, dataloader: DataLoader, name: str = "", softmax_model=None, write_csv: bool = True):
"""
Constructs an evaluator for the given dataset
:param dataloader:
the data for the evaluation
"""
self.dataloader = dataloader
self.name = name
self.softmax_model = softmax_model
if name:
name = "_" + name
self.write_csv = write_csv
self.csv_file = "accuracy_evaluation" + name + "_results.csv"
self.csv_headers = ["epoch", "steps", "accuracy"]
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
model.eval()
total = 0
correct = 0
if epoch != -1:
if steps == -1:
out_txt = " after epoch {}:".format(epoch)
else:
out_txt = " in epoch {} after {} steps:".format(epoch, steps)
else:
out_txt = ":"
logger.info("Evaluation on the " + self.name + " dataset" + out_txt)
self.dataloader.collate_fn = model.smart_batching_collate
for step, batch in enumerate(self.dataloader):
features, label_ids = batch
for idx in range(len(features)):
features[idx] = batch_to_device(features[idx], model.device)
label_ids = label_ids.to(model.device)
with torch.no_grad():
_, prediction = self.softmax_model(features, labels=None)
total += prediction.size(0)
correct += torch.argmax(prediction, dim=1).eq(label_ids).sum().item()
accuracy = correct / total
logger.info("Accuracy: {:.4f} ({}/{})\n".format(accuracy, correct, total))
if output_path is not None and self.write_csv:
csv_path = os.path.join(output_path, self.csv_file)
if not os.path.isfile(csv_path):
with open(csv_path, newline="", mode="w", encoding="utf-8") as f:
writer = csv.writer(f)
writer.writerow(self.csv_headers)
writer.writerow([epoch, steps, accuracy])
else:
with open(csv_path, newline="", mode="a", encoding="utf-8") as f:
writer = csv.writer(f)
writer.writerow([epoch, steps, accuracy])
return accuracy
from sentence_transformers.evaluation import SentenceEvaluator
import logging
import os
import csv
from typing import List
logger = logging.getLogger(__name__)
class MSEEvaluator(SentenceEvaluator):
"""
Computes the mean squared error (x100) between the computed sentence embedding
and some target sentence embedding.
The MSE is computed between teacher.encode(source_sentences) and student.encode(target_sentences), i.e. the mean of the squared element-wise differences.
For multilingual knowledge distillation (https://arxiv.org/abs/2004.09813), source_sentences are in English
and target_sentences are in a different language like German, Chinese, Spanish...
:param source_sentences: Source sentences are embedded with the teacher model
:param target_sentences: Target sentences are embedded with the student model.
:param show_progress_bar: Show progress bar when computing embeddings
:param batch_size: Batch size to compute sentence embeddings
:param name: Name of the evaluator
:param write_csv: Write results to CSV file
"""
def __init__(
self,
source_sentences: List[str],
target_sentences: List[str],
teacher_model=None,
show_progress_bar: bool = False,
batch_size: int = 32,
name: str = "",
write_csv: bool = True,
):
self.source_embeddings = teacher_model.encode(
source_sentences, show_progress_bar=show_progress_bar, batch_size=batch_size, convert_to_numpy=True
)
self.target_sentences = target_sentences
self.show_progress_bar = show_progress_bar
self.batch_size = batch_size
self.name = name
self.csv_file = "mse_evaluation_" + name + "_results.csv"
self.csv_headers = ["epoch", "steps", "MSE"]
self.write_csv = write_csv
def __call__(self, model, output_path, epoch=-1, steps=-1):
if epoch != -1:
if steps == -1:
out_txt = " after epoch {}:".format(epoch)
else:
out_txt = " in epoch {} after {} steps:".format(epoch, steps)
else:
out_txt = ":"
target_embeddings = model.encode(
self.target_sentences,
show_progress_bar=self.show_progress_bar,
batch_size=self.batch_size,
convert_to_numpy=True,
)
mse = ((self.source_embeddings - target_embeddings) ** 2).mean()
mse *= 100
logger.info("MSE evaluation (lower = better) on " + self.name + " dataset" + out_txt)
logger.info("MSE (*100):\t{:4f}".format(mse))
if output_path is not None and self.write_csv:
csv_path = os.path.join(output_path, self.csv_file)
output_file_exists = os.path.isfile(csv_path)
with open(csv_path, newline="", mode="a" if output_file_exists else "w", encoding="utf-8") as f:
writer = csv.writer(f)
if not output_file_exists:
writer.writerow(self.csv_headers)
writer.writerow([epoch, steps, mse])
return -mse # Return negative score as SentenceTransformers maximizes the performance
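# Usage sketch (illustrative only): multilingual distillation evaluation with MSEEvaluator.
# The model names and sentences are assumptions chosen for illustration.
from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import MSEEvaluator

teacher = SentenceTransformer("paraphrase-distilroberta-base-v2")       # assumed teacher
student = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")  # assumed student
source_sentences = ["Hello world", "How are you?"]     # teacher-side sentences (e.g. English)
target_sentences = ["Hallo Welt", "Wie geht es dir?"]  # student-side sentences (e.g. German)
evaluator = MSEEvaluator(source_sentences, target_sentences, teacher_model=teacher, name="en-de")
print(evaluator(student, output_path=None))  # negative MSE*100, since higher is treated as better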
from sentence_transformers.evaluation import SentenceEvaluator
from sentence_transformers import SentenceTransformer
from typing import List, Tuple, Dict
import numpy as np
import logging
import os
import csv
logger = logging.getLogger(__name__)
class MSEEvaluatorFromDataFrame(SentenceEvaluator):
"""
Computes the mean squared error (x100) between the computed sentence embedding and some target sentence embedding.
:param dataframe: It must have the following format. Rows contain different, parallel sentences.
Columns are the respective language codes::
[{'en': 'My sentence', 'es': 'Sentence in Spanish', 'fr': 'Sentence in French'...},
{'en': 'My second sentence', ...}]
:param combinations: Must be of the format ``[('en', 'es'), ('en', 'fr'), ...]``.
The first entry in a tuple is the source language. The sentence in the respective language will be fetched from
the dataframe and passed to the teacher model. The second entry in a tuple is the target language. The sentence
will be fetched from the dataframe and passed to the student model.
"""
def __init__(
self,
dataframe: List[Dict[str, str]],
teacher_model: SentenceTransformer,
combinations: List[Tuple[str, str]],
batch_size: int = 8,
name="",
write_csv: bool = True,
):
self.combinations = combinations
self.name = name
self.batch_size = batch_size
if name:
name = "_" + name
self.csv_file = "mse_evaluation" + name + "_results.csv"
self.csv_headers = ["epoch", "steps"]
self.write_csv = write_csv
self.data = {}
logger.info("Compute teacher embeddings")
all_source_sentences = set()
for src_lang, trg_lang in self.combinations:
src_sentences = []
trg_sentences = []
for row in dataframe:
if row[src_lang].strip() != "" and row[trg_lang].strip() != "":
all_source_sentences.add(row[src_lang])
src_sentences.append(row[src_lang])
trg_sentences.append(row[trg_lang])
self.data[(src_lang, trg_lang)] = (src_sentences, trg_sentences)
self.csv_headers.append("{}-{}".format(src_lang, trg_lang))
all_source_sentences = list(all_source_sentences)
all_src_embeddings = teacher_model.encode(all_source_sentences, batch_size=self.batch_size)
self.teacher_embeddings = {sent: emb for sent, emb in zip(all_source_sentences, all_src_embeddings)}
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1):
model.eval()
mse_scores = []
for src_lang, trg_lang in self.combinations:
src_sentences, trg_sentences = self.data[(src_lang, trg_lang)]
src_embeddings = np.asarray([self.teacher_embeddings[sent] for sent in src_sentences])
trg_embeddings = np.asarray(model.encode(trg_sentences, batch_size=self.batch_size))
mse = ((src_embeddings - trg_embeddings) ** 2).mean()
mse *= 100
mse_scores.append(mse)
logger.info("MSE evaluation on {} dataset - {}-{}:".format(self.name, src_lang, trg_lang))
logger.info("MSE (*100):\t{:4f}".format(mse))
if output_path is not None and self.write_csv:
csv_path = os.path.join(output_path, self.csv_file)
output_file_exists = os.path.isfile(csv_path)
with open(csv_path, newline="", mode="a" if output_file_exists else "w", encoding="utf-8") as f:
writer = csv.writer(f)
if not output_file_exists:
writer.writerow(self.csv_headers)
writer.writerow([epoch, steps] + mse_scores)
return -np.mean(mse_scores) # Return negative score as SentenceTransformers maximizes the performance
from . import SentenceEvaluator
import logging
from sentence_transformers.util import paraphrase_mining
import os
import csv
from typing import List, Tuple, Dict
from collections import defaultdict
logger = logging.getLogger(__name__)
class ParaphraseMiningEvaluator(SentenceEvaluator):
"""
Given a large set of sentences, this evaluator performs paraphrase (duplicate) mining and
identifies the pairs with the highest similarity. It compares the extracted paraphrase pairs
with a set of gold labels and computes the F1 score.
"""
def __init__(
self,
sentences_map: Dict[str, str],
duplicates_list: List[Tuple[str, str]] = None,
duplicates_dict: Dict[str, Dict[str, bool]] = None,
add_transitive_closure: bool = False,
query_chunk_size: int = 5000,
corpus_chunk_size: int = 100000,
max_pairs: int = 500000,
top_k: int = 100,
show_progress_bar: bool = False,
batch_size: int = 16,
name: str = "",
write_csv: bool = True,
):
"""
:param sentences_map: A dictionary that maps sentence-ids to sentences, i.e. sentences_map[id] => sentence.
:param duplicates_list: A list of id pairs [(id1, id2), (id1, id5)] that identifies the duplicates / paraphrases in the sentences_map
:param duplicates_dict: A default dictionary mapping [id1][id2] to True if id1 and id2 are duplicates. Must be symmetric, i.e., if [id1][id2] => True, then [id2][id1] => True.
:param add_transitive_closure: If True, adds the transitive closure, i.e. if dup[a][b] and dup[b][c], then dup[a][c]
:param query_chunk_size: To identify the paraphrases, the cosine similarity between all sentence pairs is computed. As this might require a lot of memory, the computation is batched: query_chunk_size sentences are compared against up to corpus_chunk_size other sentences at a time. With the default settings, 5000 sentences are grouped together and compared against up to 100k other sentences.
:param corpus_chunk_size: The corpus is processed in batches of this size to reduce the memory requirement
:param max_pairs: Only up to max_pairs potential paraphrase candidates are extracted
:param top_k: For each query, the top_k most similar pairs are extracted and added to a sorted list, i.e., for a single sentence no more than top_k paraphrases can be found
:param show_progress_bar: Output a progress bar
:param batch_size: Batch size for computing sentence embeddings
:param name: Name of the experiment
:param write_csv: Write results to CSV file
"""
self.sentences = []
self.ids = []
for id, sentence in sentences_map.items():
self.sentences.append(sentence)
self.ids.append(id)
self.name = name
self.show_progress_bar = show_progress_bar
self.batch_size = batch_size
self.query_chunk_size = query_chunk_size
self.corpus_chunk_size = corpus_chunk_size
self.max_pairs = max_pairs
self.top_k = top_k
self.duplicates = duplicates_dict if duplicates_dict is not None else defaultdict(lambda: defaultdict(bool))
if duplicates_list is not None:
for id1, id2 in duplicates_list:
if id1 in sentences_map and id2 in sentences_map:
self.duplicates[id1][id2] = True
self.duplicates[id2][id1] = True
# Add transitive closure
if add_transitive_closure:
self.duplicates = self.add_transitive_closure(self.duplicates)
positive_key_pairs = set()
for key1 in self.duplicates:
for key2 in self.duplicates[key1]:
if (
key1 in sentences_map
and key2 in sentences_map
and (self.duplicates[key1][key2] or self.duplicates[key2][key1])
):
positive_key_pairs.add(tuple(sorted([key1, key2])))
self.total_num_duplicates = len(positive_key_pairs)
if name:
name = "_" + name
self.csv_file: str = "paraphrase_mining_evaluation" + name + "_results.csv"
self.csv_headers = ["epoch", "steps", "precision", "recall", "f1", "threshold", "average_precision"]
self.write_csv = write_csv
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
if epoch != -1:
out_txt = f" after epoch {epoch}:" if steps == -1 else f" in epoch {epoch} after {steps} steps:"
else:
out_txt = ":"
logger.info("Paraphrase Mining Evaluation on " + self.name + " dataset" + out_txt)
# Compute embedding for the sentences
pairs_list = paraphrase_mining(
model,
self.sentences,
self.show_progress_bar,
self.batch_size,
self.query_chunk_size,
self.corpus_chunk_size,
self.max_pairs,
self.top_k,
)
logger.info("Number of candidate pairs: " + str(len(pairs_list)))
# Compute F1 score and Average Precision
n_extract = n_correct = 0
threshold = 0
best_f1 = best_recall = best_precision = 0
average_precision = 0
for idx in range(len(pairs_list)):
score, i, j = pairs_list[idx]
id1 = self.ids[i]
id2 = self.ids[j]
# Compute optimal threshold and F1-score
n_extract += 1
if self.duplicates[id1][id2] or self.duplicates[id2][id1]:
n_correct += 1
precision = n_correct / n_extract
recall = n_correct / self.total_num_duplicates
f1 = 2 * precision * recall / (precision + recall)
average_precision += precision
if f1 > best_f1:
best_f1 = f1
best_precision = precision
best_recall = recall
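                        # Decision threshold: midpoint between this pair's similarity score and the next candidate's score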
threshold = (pairs_list[idx][0] + pairs_list[min(idx + 1, len(pairs_list) - 1)][0]) / 2
average_precision = average_precision / self.total_num_duplicates
logger.info("Average Precision: {:.2f}".format(average_precision * 100))
logger.info("Optimal threshold: {:.4f}".format(threshold))
logger.info("Precision: {:.2f}".format(best_precision * 100))
logger.info("Recall: {:.2f}".format(best_recall * 100))
logger.info("F1: {:.2f}\n".format(best_f1 * 100))
if output_path is not None and self.write_csv:
csv_path = os.path.join(output_path, self.csv_file)
if not os.path.isfile(csv_path):
with open(csv_path, newline="", mode="w", encoding="utf-8") as f:
writer = csv.writer(f)
writer.writerow(self.csv_headers)
writer.writerow([epoch, steps, best_precision, best_recall, best_f1, threshold, average_precision])
else:
with open(csv_path, newline="", mode="a", encoding="utf-8") as f:
writer = csv.writer(f)
writer.writerow([epoch, steps, best_precision, best_recall, best_f1, threshold, average_precision])
return average_precision
@staticmethod
def add_transitive_closure(graph):
nodes_visited = set()
for a in list(graph.keys()):
if a not in nodes_visited:
connected_subgraph_nodes = set()
connected_subgraph_nodes.add(a)
# Add all nodes in the connected graph
neighbor_nodes_queue = list(graph[a])
while len(neighbor_nodes_queue) > 0:
node = neighbor_nodes_queue.pop(0)
if node not in connected_subgraph_nodes:
connected_subgraph_nodes.add(node)
neighbor_nodes_queue.extend(graph[node])
# Ensure transitivity between all nodes in the graph
connected_subgraph_nodes = list(connected_subgraph_nodes)
for i in range(len(connected_subgraph_nodes) - 1):
for j in range(i + 1, len(connected_subgraph_nodes)):
graph[connected_subgraph_nodes[i]][connected_subgraph_nodes[j]] = True
graph[connected_subgraph_nodes[j]][connected_subgraph_nodes[i]] = True
nodes_visited.add(connected_subgraph_nodes[i])
nodes_visited.add(connected_subgraph_nodes[j])
return graph
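# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original module): wiring the
# evaluator up with a toy sentences_map and duplicates_list. The ids, sentences
# and model name below are hypothetical placeholders.
def _example_paraphrase_mining_evaluation():
    from sentence_transformers import SentenceTransformer
    sentences_map = {
        "q1": "How can I learn Python quickly?",
        "q2": "What is the fastest way to learn Python?",
        "q3": "How tall is Mount Everest?",
    }
    duplicates_list = [("q1", "q2")]
    evaluator = ParaphraseMiningEvaluator(sentences_map, duplicates_list=duplicates_list, name="dev")
    model = SentenceTransformer("all-MiniLM-L6-v2")  # model name is an assumption
    return evaluator(model, output_path=".")  # returns average precision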
from . import SentenceEvaluator
import logging
import numpy as np
import os
import csv
from ..util import cos_sim
import torch
from sklearn.metrics import average_precision_score, ndcg_score
import tqdm
from typing import Optional
logger = logging.getLogger(__name__)
class RerankingEvaluator(SentenceEvaluator):
"""
This class evaluates a SentenceTransformer model for the task of re-ranking.
Given a query and a list of documents, it computes the score [query, doc_i] for all possible
documents and sorts them in decreasing order. Then, MRR@10, NDCG@10 and MAP are computed to measure the quality of the ranking.
:param samples: Must be a list where each element is of the form: {'query': '', 'positive': [], 'negative': []}. Query is the search query,
positive is a list of positive (relevant) documents, negative is a list of negative (irrelevant) documents.
"""
def __init__(
self,
samples,
at_k: int = 10,
name: str = "",
write_csv: bool = True,
similarity_fct=cos_sim,
batch_size: int = 64,
show_progress_bar: bool = False,
use_batched_encoding: bool = True,
mrr_at_k: Optional[int] = None,
):
self.samples = samples
self.name = name
if mrr_at_k is not None:
logger.warning(f"The `mrr_at_k` parameter has been deprecated; please use `at_k={mrr_at_k}` instead.")
self.at_k = mrr_at_k
else:
self.at_k = at_k
self.similarity_fct = similarity_fct
self.batch_size = batch_size
self.show_progress_bar = show_progress_bar
self.use_batched_encoding = use_batched_encoding
if isinstance(self.samples, dict):
self.samples = list(self.samples.values())
### Remove samples with an empty positive / negative set
self.samples = [
sample for sample in self.samples if len(sample["positive"]) > 0 and len(sample["negative"]) > 0
]
self.csv_file = "RerankingEvaluator" + ("_" + name if name else "") + f"_results_@{self.at_k}.csv"
self.csv_headers = [
"epoch",
"steps",
"MAP",
"MRR@{}".format(self.at_k),
"NDCG@{}".format(self.at_k),
]
self.write_csv = write_csv
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
if epoch != -1:
if steps == -1:
out_txt = " after epoch {}:".format(epoch)
else:
out_txt = " in epoch {} after {} steps:".format(epoch, steps)
else:
out_txt = ":"
logger.info("RerankingEvaluator: Evaluating the model on " + self.name + " dataset" + out_txt)
scores = self.compute_metrices(model)
mean_ap = scores["map"]
mean_mrr = scores["mrr"]
mean_ndcg = scores["ndcg"]
#### Some stats about the dataset
num_positives = [len(sample["positive"]) for sample in self.samples]
num_negatives = [len(sample["negative"]) for sample in self.samples]
logger.info(
"Queries: {} \t Positives: Min {:.1f}, Mean {:.1f}, Max {:.1f} \t Negatives: Min {:.1f}, Mean {:.1f}, Max {:.1f}".format(
len(self.samples),
np.min(num_positives),
np.mean(num_positives),
np.max(num_positives),
np.min(num_negatives),
np.mean(num_negatives),
np.max(num_negatives),
)
)
logger.info("MAP: {:.2f}".format(mean_ap * 100))
logger.info("MRR@{}: {:.2f}".format(self.at_k, mean_mrr * 100))
logger.info("NDCG@{}: {:.2f}".format(self.at_k, mean_ndcg * 100))
#### Write results to disc
if output_path is not None and self.write_csv:
csv_path = os.path.join(output_path, self.csv_file)
output_file_exists = os.path.isfile(csv_path)
with open(csv_path, newline="", mode="a" if output_file_exists else "w", encoding="utf-8") as f:
writer = csv.writer(f)
if not output_file_exists:
writer.writerow(self.csv_headers)
writer.writerow([epoch, steps, mean_ap, mean_mrr, mean_ndcg])
return mean_ap
def compute_metrices(self, model):
return (
self.compute_metrices_batched(model)
if self.use_batched_encoding
else self.compute_metrices_individual(model)
)
def compute_metrices_batched(self, model):
"""
Computes the metrics in a batched way, by encoding all queries and
all documents together
"""
all_mrr_scores = []
all_ndcg_scores = []
all_ap_scores = []
all_query_embs = model.encode(
[sample["query"] for sample in self.samples],
convert_to_tensor=True,
batch_size=self.batch_size,
show_progress_bar=self.show_progress_bar,
)
all_docs = []
for sample in self.samples:
all_docs.extend(sample["positive"])
all_docs.extend(sample["negative"])
all_docs_embs = model.encode(
all_docs, convert_to_tensor=True, batch_size=self.batch_size, show_progress_bar=self.show_progress_bar
)
# Compute scores
query_idx, docs_idx = 0, 0
for instance in self.samples:
query_emb = all_query_embs[query_idx]
query_idx += 1
num_pos = len(instance["positive"])
num_neg = len(instance["negative"])
docs_emb = all_docs_embs[docs_idx : docs_idx + num_pos + num_neg]
docs_idx += num_pos + num_neg
if num_pos == 0 or num_neg == 0:
continue
pred_scores = self.similarity_fct(query_emb, docs_emb)
if len(pred_scores.shape) > 1:
pred_scores = pred_scores[0]
pred_scores_argsort = torch.argsort(-pred_scores) # Sort in decreasing order
pred_scores = pred_scores.cpu().tolist()
# Compute MRR score
is_relevant = [1] * num_pos + [0] * num_neg
mrr_score = 0
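            # Reciprocal rank of the first relevant document within the top at_k results,
            # e.g. 1/2 if the first relevant document appears at rank 2 (1-based);
            # stays 0 if none of the top at_k documents is relevant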
for rank, index in enumerate(pred_scores_argsort[0 : self.at_k]):
if is_relevant[index]:
mrr_score = 1 / (rank + 1)
break
all_mrr_scores.append(mrr_score)
# Compute NDCG score
all_ndcg_scores.append(ndcg_score([is_relevant], [pred_scores], k=self.at_k))
# Compute AP
all_ap_scores.append(average_precision_score(is_relevant, pred_scores))
mean_ap = np.mean(all_ap_scores)
mean_mrr = np.mean(all_mrr_scores)
mean_ndcg = np.mean(all_ndcg_scores)
return {"map": mean_ap, "mrr": mean_mrr, "ndcg": mean_ndcg}
def compute_metrices_individual(self, model):
"""
Embeds every (query, positive, negative) tuple individually.
This is slower than the batched version, but saves memory as only the
embeddings for one tuple are needed at a time. Useful when you have
a really large test set.
"""
all_mrr_scores = []
all_ndcg_scores = []
all_ap_scores = []
for instance in tqdm.tqdm(self.samples, disable=not self.show_progress_bar, desc="Samples"):
query = instance["query"]
positive = list(instance["positive"])
negative = list(instance["negative"])
if len(positive) == 0 or len(negative) == 0:
continue
docs = positive + negative
is_relevant = [1] * len(positive) + [0] * len(negative)
query_emb = model.encode(
[query], convert_to_tensor=True, batch_size=self.batch_size, show_progress_bar=False
)
docs_emb = model.encode(docs, convert_to_tensor=True, batch_size=self.batch_size, show_progress_bar=False)
pred_scores = self.similarity_fct(query_emb, docs_emb)
if len(pred_scores.shape) > 1:
pred_scores = pred_scores[0]
pred_scores_argsort = torch.argsort(-pred_scores) # Sort in decreasing order
pred_scores = pred_scores.cpu().tolist()
# Compute MRR score
mrr_score = 0
for rank, index in enumerate(pred_scores_argsort[0 : self.at_k]):
if is_relevant[index]:
mrr_score = 1 / (rank + 1)
break
all_mrr_scores.append(mrr_score)
# Compute NDCG score
all_ndcg_scores.append(ndcg_score([is_relevant], [pred_scores], k=self.at_k))
# Compute AP
all_ap_scores.append(average_precision_score(is_relevant, pred_scores))
mean_ap = np.mean(all_ap_scores)
mean_mrr = np.mean(all_mrr_scores)
mean_ndcg = np.mean(all_ndcg_scores)
return {"map": mean_ap, "mrr": mean_mrr, "ndcg": mean_ndcg}
class SentenceEvaluator:
"""
Base class for all evaluators
Extend this class and implement __call__ for custom evaluators.
"""
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
"""
This is called during training to evaluate the model.
It returns a score for the evaluation with a higher score indicating a better result.
:param model:
the model to evaluate
:param output_path:
path where predictions and metrics are written to
:param epoch:
the epoch where the evaluation takes place.
This is used for the file prefixes.
If this is -1, then we assume evaluation on test data.
:param steps:
the steps in the current epoch at time of the evaluation.
This is used for the file prefixes.
If this is -1, then we assume evaluation at the end of the epoch.
:return: a score for the evaluation with a higher score indicating a better result
"""
pass
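# ---------------------------------------------------------------------------
# Sketch of a custom evaluator (illustrative, not part of the original module):
# extend SentenceEvaluator and implement __call__ so that a higher return value
# means a better model. The metric below is a hypothetical placeholder.
class _ExampleCosineEvaluator(SentenceEvaluator):
    def __init__(self, sentence_pairs, gold_scores):
        self.sentence_pairs = sentence_pairs  # list of (sentence_a, sentence_b) tuples
        self.gold_scores = gold_scores  # gold similarity per pair, in [0, 1]

    def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
        import numpy as np
        emb_a = model.encode([a for a, _ in self.sentence_pairs], convert_to_numpy=True)
        emb_b = model.encode([b for _, b in self.sentence_pairs], convert_to_numpy=True)
        cos = (emb_a * emb_b).sum(axis=1) / (
            np.linalg.norm(emb_a, axis=1) * np.linalg.norm(emb_b, axis=1)
        )
        # Higher is better: negative mean absolute error against the gold scores
        return -float(np.abs(cos - np.asarray(self.gold_scores)).mean())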
from . import SentenceEvaluator
from typing import Iterable
class SequentialEvaluator(SentenceEvaluator):
"""
This evaluator allows multiple sub-evaluators to be passed. When the model is evaluated,
the data is passed sequentially to all sub-evaluators.
All scores are passed to 'main_score_function', which derives one final score value
"""
def __init__(self, evaluators: Iterable[SentenceEvaluator], main_score_function=lambda scores: scores[-1]):
self.evaluators = evaluators
self.main_score_function = main_score_function
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
scores = []
for evaluator in self.evaluators:
scores.append(evaluator(model, output_path, epoch, steps))
return self.main_score_function(scores)
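# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original module): run several
# evaluators in sequence and average their scores instead of keeping only the
# last one. `evaluator_a` and `evaluator_b` are hypothetical evaluator instances.
def _example_sequential_evaluation(model, evaluator_a, evaluator_b, output_path=None):
    seq_evaluator = SequentialEvaluator(
        [evaluator_a, evaluator_b],
        main_score_function=lambda scores: sum(scores) / len(scores),
    )
    return seq_evaluator(model, output_path=output_path)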