arc.py 3.53 KB
Newer Older
Jonathan Tow's avatar
Jonathan Tow committed
1
2
import numpy as np
from lm_eval.base import rf, mean
3
from . common import HFTask
Leo Gao's avatar
Leo Gao committed
4

Jonathan Tow's avatar
Jonathan Tow committed
5

6
class ARCEasy(HFTask):
    """Multiple-choice task over the "ARC-Easy" config of the `ai2_arc` dataset.

    Every answer choice is submitted to the LM as a continuation of the
    question prompt (one log-likelihood request per choice); the choice with
    the highest score is taken as the prediction and accuracy is reported.
    """

    DATASET_PATH = "ai2_arc"
    DATASET_NAME = "ARC-Easy"

    # Maps an answer letter to its position in `doc["choices"]["text"]`.
    letter_to_num = {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4}

    def __init__(self):
        super().__init__()
        # `self.data` is presumably populated by `HFTask.__init__` (verify
        # against the base class); replace it with the label-normalized docs.
        self.data = self.__clean_data()

    def __clean_data(self):
        """ Resolves various edge cases in the unprocessed HF ARC dataset.

        :returns: {str: [dict]}
            A dictionary mapping each split name in `self.data` to the list of
            its documents, with every `answerKey` and choice `label` expressed
            as a letter.
        """
        # NOTE: Some `doc["answerKey"]`s are in numeric string format being one
        # of {'1', '2', '3', '4', '5'}. We map them back to letters.
        num_to_letter = {'1': 'A', '2': 'B', '3': 'C', '4': 'D', '5': 'E'}

        def as_letter(label):
            # Values that are not numeric strings pass through untouched.
            return num_to_letter.get(label, label)

        def normalize(doc):
            # Ensure all `answerKey`s and `label`s are in letter format.
            doc["answerKey"] = as_letter(doc["answerKey"])
            doc["choices"]["label"] = [
                as_letter(label) for label in doc["choices"]["label"]
            ]
            return doc

        return {
            split: [normalize(doc) for doc in docs]
            for split, docs in self.data.items()
        }

    def has_training_docs(self):
        """ Reports that this task provides training documents. """
        return True

    def has_validation_docs(self):
        """ Reports that this task provides validation documents. """
        return True

    def has_test_docs(self):
        """ Reports that this task provides test documents. """
        return True

    def fewshot_description(self):
        """ Returns the task description used for few-shot prompts (currently empty). """
        # TODO: figure out description
        return ""

    def doc_to_text(self, doc):
        """ Builds the prompt for `doc`: its question wrapped in a
        "Question: ... Answer:" template, with a newline before "Answer:".
        """
        return "Question: " + doc['question'] + '\nAnswer:'

    def doc_to_target(self, doc):
        """ Returns the text of the gold answer choice, prefixed with a space.

        Expects `doc["answerKey"]` to be a letter key of `letter_to_num`; a
        KeyError is raised otherwise.
        """
        index = self.letter_to_num[doc["answerKey"]]
        return " " + doc['choices']['text'][index]

    def construct_requests(self, doc, ctx):
        """ Uses RequestFactory to construct Requests and returns an iterable of
        Requests which will be sent to the LM.

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param ctx: str
            The context string, generated by fewshot_context. This includes the natural
            language description, as well as the few shot examples, and the question
            part of the document for `doc`.
        :returns: list
            One request per answer choice, in the order of `doc["choices"]["text"]`.
        """
        # Only element [0] of each loglikelihood request is kept -- presumably
        # the log-likelihood value itself (confirm against `rf.loglikelihood`).
        return [
            rf.loglikelihood(ctx, " " + choice)[0]
            for choice in doc["choices"]["text"]
        ]

    def process_results(self, doc, results):
        """Take a single document and the LM results and evaluates, returning a
        dict where keys are the names of submetrics and values are the values of
        the metric for that one document

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param results:
            The results of the requests created in construct_requests.
        :returns: {str: bool}
            `{"acc": ...}`, true when the highest-scoring choice is the gold one.
        """
        gold = self.letter_to_num[doc["answerKey"]]
        # `results` follows the choice order used in construct_requests, so the
        # argmax position can be compared directly with the gold index.
        pred = np.argmax(results)
        return {
            "acc": pred == gold
        }

    def aggregation(self):
        """
        :returns: {str: [float] -> float}
            A dictionary where keys are the names of submetrics and values are
            functions that aggregate a list of metrics
        """
        return {
            "acc": mean
        }

    def higher_is_better(self):
        """
        :returns: {str: bool}
            A dictionary where keys are the names of submetrics and values are
            whether a higher value of the submetric is better
        """
        return {
            "acc": True
        }

Leo Gao's avatar
Leo Gao committed
105
106

class ARCChallenge(ARCEasy):
    """Same task as `ARCEasy`, run on the "ARC-Challenge" config of `ai2_arc`.

    All prompting, request construction and scoring is inherited; only the
    dataset identifiers are declared here.
    """

    DATASET_PATH = "ai2_arc"
    DATASET_NAME = "ARC-Challenge"