# evaluation_translation_matching.py

"""
Given a dataset with parallel sentences, one "english" column and one "non_english" column, this script evaluates a model on the translation task.
Given a sentence in the "english" column, the model should find the correct translation in the "non_english" column, based on just the embeddings.

It then computes the accuracy over all source sentences src_i. Likewise, it also computes the accuracy for the other direction.
A high accuracy score indicates that the model is able to find the correct translation out of a large pool of sentences.

Good options for datasets are:
* sentence-transformers/parallel-sentences-wikimatrix
* sentence-transformers/parallel-sentences-tatoeba
* sentence-transformers/parallel-sentences-talks

These datasets are good choices because they have development sets.

Usage:
python examples/evaluation/evaluation_translation_matching.py [model_name_or_path] [dataset_name] [subset1] [subset2] ...

For example:
python examples/evaluation/evaluation_translation_matching.py distiluse-base-multilingual-cased sentence-transformers/parallel-sentences-tatoeba en-ar en-de en-nl
"""

import logging
import sys

from datasets import load_dataset
from sentence_transformers import SentenceTransformer, evaluation

# Set the log level to INFO to get more information
logging.basicConfig(format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO)

# Exit with a usage hint instead of a bare IndexError when required arguments are missing.
if len(sys.argv) < 3:
    sys.exit(f"Usage: python {sys.argv[0]} [model_name_or_path] [dataset_name] [subset1] [subset2] ...")

model_name = sys.argv[1]
dataset_name = sys.argv[2]
subsets = sys.argv[3:]
inference_batch_size = 32

if not subsets:
    logging.warning("No subsets were given, so there is nothing to evaluate.")

model = SentenceTransformer(model_name)

for subset in subsets:
    dataset = load_dataset(dataset_name, subset)

    # Maps a split label (used in the log line and the evaluator name) to the data evaluated for it.
    datasets = {}
    # Compare the split names themselves: `dataset.column_names` is a per-split mapping,
    # so it never equals the list ["train"].
    if list(dataset.keys()) == ["train"]:
        # Only a train split exists: fall back to evaluating on (at most) its first 5000 samples.
        num_samples = min(5000, len(dataset["train"]))
        datasets[f"train[:{num_samples}]"] = dataset["train"].select(range(num_samples))
    else:
        # Other splits exist: evaluate on every split except "train".
        for split, sub_dataset in dataset.items():
            if split != "train":
                datasets[split] = sub_dataset

    for split, sub_dataset in datasets.items():
        logging.info(f"{dataset_name}, subset={subset}, split={split}, num_samples={len(sub_dataset)}")
        translation_evaluator = evaluation.TranslationEvaluator(
            sub_dataset["english"],
            sub_dataset["non_english"],
            name=f"{dataset_name}-{subset}-{split}",
            batch_size=inference_batch_size,
        )
        # Calling the evaluator runs the evaluation on the model.
        translation_evaluator(model)