"""
Similarity of Semantic Relations
https://arxiv.org/pdf/cs/0608100.pdf

SAT (Scholastic Aptitude Test) Analogy Questions is a dataset comprising 374
multiple-choice analogy questions; 5 choices per question.

Homepage: https://aclweb.org/aclwiki/SAT_Analogy_Questions_(State_of_the_art)
"""
import inspect
import lm_eval.datasets.sat_analogies.sat_analogies
from lm_eval.base import MultipleChoiceTask

_CITATION = """
16
17
18
19
20
21
22
23
24
25
26
@article{article,
    author = {Turney, Peter},
    year = {2006},
    month = {09},
    pages = {379-416},
    title = {Similarity of Semantic Relations},
    volume = {32},
    journal = {Computational Linguistics},
    doi = {10.1162/coli.2006.32.3.379}
}
"""
27
28


Jonathan Tow's avatar
Jonathan Tow committed
29
class SATAnalogies(MultipleChoiceTask):
    """Multiple-choice task built on the SAT analogy questions.

    Only a validation split is exposed. Each processed document pairs a stem
    ("A is to B as") with candidate completions of the form "C is to D".
    """

    VERSION = 0
    # File that defines the `lm_eval.datasets.sat_analogies.sat_analogies`
    # module (presumably the dataset loading script -- confirm against the
    # base class, which consumes DATASET_PATH).
    DATASET_PATH = inspect.getfile(lm_eval.datasets.sat_analogies.sat_analogies)
    DATASET_NAME = None

    def __init__(self, data_dir: str):
        """
        SAT Analogy Questions is not publicly available. You must request the
        data by emailing Peter Turney and then download it to a local
        directory path which should be passed into the `data_dir` arg.
        """
        super().__init__(data_dir=data_dir)

    def has_training_docs(self):
        """Return False: this task provides no training split."""
        return False

    def has_validation_docs(self):
        """Return True: the validation split is the only one provided."""
        return True

    def has_test_docs(self):
        """Return False: this task provides no test split."""
        return False

    def training_docs(self):
        """Return an empty list, since there are no training documents."""
        return []

    def validation_docs(self):
        """Return a lazy iterator over the processed validation documents."""
        return map(self._process_doc, self.dataset["validation"])

    def test_docs(self):
        """Return an empty list, since there are no test documents."""
        return []

    def _process_doc(self, doc):
        """Convert a raw dataset record into the multiple-choice format.

        Args:
            doc: Mapping with the keys "source", "stem", "choices" and
                "solution", as read from the dataset.

        Returns:
            A dict with:
              - "source": copied from `doc` unchanged.
              - "query": the first two space-separated tokens of the stem.
              - "choices": one "X is to Y" string per choice, built from the
                first two space-separated tokens of that choice.
              - "gold": zero-based position of the solution letter in a-e.

        Raises:
            ValueError: if the stripped solution is not one of "a".."e".
            IndexError: if a choice has fewer than two tokens.
        """
        return {
            "source": doc["source"],
            # Only the first two tokens are kept; any tokens after the word
            # pair are dropped.
            "query": doc["stem"].split(" ")[:2],
            "choices": [
                "{} is to {}".format(*c.split(" ")[:2]) for c in doc["choices"]
            ],
            "gold": ["a", "b", "c", "d", "e"].index(doc["solution"].strip()),
        }

    def doc_to_text(self, doc):
        """Render the prompt "A is to B as" from the document's query pair."""
        return "{} is to {} as".format(*doc["query"])

    def should_decontaminate(self):
        """Return True: this task opts in to decontamination."""
        return True

    def doc_to_decontamination_query(self, doc):
        """Return the source and the space-joined query pair, newline-separated."""
        return doc["source"] + "\n" + " ".join(doc["query"])