asdiv.py 2.85 KB
Newer Older
rokosbasilisk's avatar
rokosbasilisk committed
1
2
3
4
"""
ASDiv: A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers
https://arxiv.org/abs/2106.15772

ASDiv (Academia Sinica Diverse MWP Dataset) is a diverse (in terms of both language
patterns and problem types) English math word problem (MWP) corpus for evaluating
the capability of various MWP solvers. Existing MWP corpora for studying AI progress
remain limited either in language usage patterns or in problem types. We thus present
a new English MWP corpus with 2,305 MWPs that cover more text patterns and most problem
types taught in elementary school. Each MWP is annotated with its problem type and grade
level (for indicating the level of difficulty).

NOTE: We currently ignore formulas for answer generation.

Homepage: https://github.com/chaochun/nlu-asdiv-dataset
"""
Jonathan Tow's avatar
Jonathan Tow committed
17
18
19
20
import inspect
import lm_eval.datasets.asdiv.asdiv
from lm_eval.base import rf, Task
from lm_eval.metrics import mean
rokosbasilisk's avatar
rokosbasilisk committed
21
22


23
24
25
26
27
28
29
30
31
32
33
34
# BibTeX entry for the ASDiv paper (arXiv:2106.15772).
_CITATION = """
@misc{miao2021diverse,
    title={A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers},
    author={Shen-Yun Miao and Chao-Chun Liang and Keh-Yih Su},
    year={2021},
    eprint={2106.15772},
    archivePrefix={arXiv},
    primaryClass={cs.AI}
}
"""


rokosbasilisk's avatar
rokosbasilisk committed
35
36
class Asdiv(Task):
    """Zero-shot ASDiv math word problem task.

    Each document is read through its ``body``, ``question`` and ``answer``
    keys. The prompt is the body followed by the question; the target is the
    answer text up to the first ``" ("``. Only a validation split is exposed,
    and the single reported metric is ``acc``.
    """

    VERSION = 0
    # Path of the dataset loading module, resolved from the imported module.
    DATASET_PATH = inspect.getfile(lm_eval.datasets.asdiv.asdiv)

    def has_training_docs(self):
        """Return False: this task exposes no training split."""
        return False

    def has_validation_docs(self):
        """Return True: documents are served from the validation split."""
        return True

    def has_test_docs(self):
        """Return False: this task exposes no test split."""
        return False

    def training_docs(self):
        """Always raise, since there is no training split.

        Raises:
            NotImplementedError: Unconditionally.
        """
        raise NotImplementedError("This dataset has no training docs")

    def validation_docs(self):
        """Return the ``"validation"`` entry of ``self.dataset``.

        NOTE(review): ``self.dataset`` is not set in this class; presumably the
        ``Task`` base class populates it -- confirm.
        """
        return self.dataset["validation"]

    def test_docs(self):
        """Always raise, since there is no test split.

        Raises:
            NotImplementedError: Unconditionally.
        """
        raise NotImplementedError("This dataset has no test docs")

    def fewshot_context(
        self, doc, num_fewshot, provide_description=None, rnd=None, description=None
    ):
        """Build the prompt context for ``doc``; only zero-shot is permitted.

        Args:
            doc: Document to build the context for.
            num_fewshot: Number of few-shot examples; must be 0.
            provide_description: Accepted for signature compatibility but not
                forwarded to the base implementation.
            rnd: Forwarded unchanged to the base implementation.
            description: Forwarded unchanged to the base implementation.

        Returns:
            Whatever the base class ``fewshot_context`` returns.

        Raises:
            AssertionError: If ``num_fewshot`` is not 0.
        """
        if num_fewshot != 0:
            # Raised explicitly instead of via ``assert`` so the check is not
            # stripped under ``python -O``; the exception type is unchanged.
            raise AssertionError("ASDiv is intended only for the zero-shot setting.")
        return super().fewshot_context(
            doc=doc, num_fewshot=num_fewshot, rnd=rnd, description=description
        )

    def doc_to_text(self, doc):
        """Format ``doc`` as the prompt: body, question line, then ``Answer:``."""
        # TODO: add solution-type
        return doc["body"] + "\n" + "Question:" + doc["question"] + "\n" + "Answer:"

    def should_decontaminate(self):
        """Return True: this task provides a decontamination query."""
        return True

    def doc_to_decontamination_query(self, doc):
        """Return the body and question joined by a single space."""
        return doc["body"] + " " + doc["question"]

    def doc_to_target(self, doc):
        """Return the target continuation: a space followed by the answer.

        Everything from the first ``" ("`` in ``doc["answer"]`` onward is
        dropped (presumably a trailing parenthesised annotation such as a
        unit -- confirm against the dataset).
        """
        # TODO: add formula
        answer = doc["answer"].split(" (")[0]
        return " " + answer

    def construct_requests(self, doc, ctx):
        """Create the loglikelihood request for the target given ``ctx``.

        Args:
            doc: Document whose target continuation is scored.
            ctx: Context string the target is conditioned on.

        Returns:
            The ``(ll, is_greedy)`` pair unpacked from ``rf.loglikelihood``.
        """
        ll, is_greedy = rf.loglikelihood(ctx, self.doc_to_target(doc))
        return ll, is_greedy

    def process_results(self, doc, results):
        """Turn the request results for one document into its metrics.

        Args:
            doc: The evaluated document (unused).
            results: Two-element sequence ``(ll, is_greedy)``.

        Returns:
            ``{"acc": int(is_greedy)}``.
        """
        # Only the second element is scored; the first is discarded.
        _, is_greedy = results
        return {"acc": int(is_greedy)}

    def aggregation(self):
        """Return the aggregation function per metric: ``acc`` uses ``mean``."""
        return {"acc": mean}

    def higher_is_better(self):
        """Return the per-metric direction: a higher ``acc`` is better."""
        return {"acc": True}