# sat.py

"""
Similarity of Semantic Relations
https://arxiv.org/pdf/cs/0608100.pdf

SAT (Scholastic Aptitude Test) Analogy Questions is a dataset comprising 374
multiple-choice analogy questions; 5 choices per question.

Homepage: https://aclweb.org/aclwiki/SAT_Analogy_Questions_(State_of_the_art)
"""
import inspect
import lm_eval.datasets.sat_analogies.sat_analogies
from lm_eval.base import MultipleChoiceTask


# BibTeX entry for the paper linked in the module docstring
# (Turney, 2006, "Similarity of Semantic Relations").
_CITATION = """
@article{article,
    author = {Turney, Peter},
    year = {2006},
    month = {09},
    pages = {379-416},
    title = {Similarity of Semantic Relations},
    volume = {32},
    journal = {Computational Linguistics},
    doi = {10.1162/coli.2006.32.3.379}
}
"""


class SATAnalogies(MultipleChoiceTask):
    """SAT analogy questions framed as a multiple-choice task.

    Each question consists of a stem word pair and five candidate word pairs
    labelled ``a``-``e``. The prompt is rendered as ``"<w1> is to <w2> as"``
    and each candidate as ``"<w1> is to <w2>"``. Only a validation split is
    exposed; the training and test splits are empty.
    """

    VERSION = 0
    # Path of the source file of the local dataset-loading module.
    DATASET_PATH = inspect.getfile(lm_eval.datasets.sat_analogies.sat_analogies)
    DATASET_NAME = None

    def __init__(self, data_dir: str):
        """
        SAT Analogy Questions is not publicly available. You must request the
        data by emailing Peter Turney and then download it to a local
        directory path which should be passed into the `data_dir` arg.
        """
        super().__init__(data_dir=data_dir)

    def has_training_docs(self):
        """Return False: this task provides no training split."""
        return False

    def has_validation_docs(self):
        """Return True: the questions are served as the validation split."""
        return True

    def has_test_docs(self):
        """Return False: this task provides no test split."""
        return False

    def training_docs(self):
        """Return an empty list (there is no training split)."""
        return []

    def validation_docs(self):
        """Return a lazy, single-pass iterator over the processed validation docs.

        NOTE(review): `self.dataset` is not assigned in this class; presumably
        the base class populates it from DATASET_PATH / `data_dir` - confirm.
        """
        return map(self._process_doc, self.dataset["validation"])

    def test_docs(self):
        """Return an empty list (there is no test split)."""
        return []

    def _process_doc(self, doc):
        """Convert one raw record into the document dict used by this task.

        Args:
            doc: Mapping with the keys ``"source"``, ``"stem"``, ``"choices"``
                (an iterable of strings) and ``"solution"`` (one of the
                letters ``a``-``e``, surrounding whitespace allowed).

        Returns:
            A dict with:
              * ``"source"``: copied unchanged from ``doc``.
              * ``"query"``: list holding the first two words of the stem.
              * ``"choices"``: one ``"<w1> is to <w2>"`` string per choice.
              * ``"gold"``: zero-based index of the solution letter.

        Raises:
            ValueError: If the stripped solution is not one of ``a``-``e``.
            IndexError: If a choice contains fewer than two words.
        """
        # `split()` without a separator collapses runs of whitespace and also
        # splits on tabs; `split(" ")` would produce empty-string "words"
        # whenever two spaces are adjacent. Tokens after the second word are
        # dropped.
        stem_pair = doc["stem"].split()[:2]
        choices = [
            "{} is to {}".format(*choice.split()[:2]) for choice in doc["choices"]
        ]
        gold = ["a", "b", "c", "d", "e"].index(doc["solution"].strip())
        return {
            "source": doc["source"],
            "query": stem_pair,
            "choices": choices,
            "gold": gold,
        }

    def doc_to_text(self, doc):
        """Render the prompt ``"<w1> is to <w2> as"`` from the doc's query pair."""
        return "{} is to {} as".format(*doc["query"])

    def should_decontaminate(self):
        """Return True: this task supplies a decontamination query."""
        return True

    def doc_to_decontamination_query(self, doc):
        """Return the doc's source, a newline, then the query words joined by spaces."""
        return doc["source"] + "\n" + " ".join(doc["query"])