gaoqiong / lm-evaluation-harness · Commits

Commit 2a1d7d87
Authored Feb 04, 2021 by Leo Gao

Merge branch 'master' of github.com:EleutherAI/lm_evaluation_harness

Parents: b1f7284e, a55a5c52
Showing 3 changed files with 50 additions and 15 deletions:
lm_eval/tasks/__init__.py   (+2, -2)
lm_eval/tasks/arc.py        (+45, -10)
lm_eval/tasks/arithmetic.py (+3, -3)
lm_eval/tasks/__init__.py

@@ -47,8 +47,8 @@ TASK_REGISTRY = {
     "piqa": piqa.PiQA,
     #"triviaqa": triviaqa.TriviaQA,
-    # "arc_easy": arc.ARCEasy, # not implemented yet
-    # "arc_challenge": arc.ARCChallenge, # not implemented yet
+    "arc_easy": arc.ARCEasy,
+    "arc_challenge": arc.ARCChallenge,
     # "quac": quac.QuAC, # not implemented yet
     "hellaswag": hellaswag.HellaSwag, # not implemented yet
     # "openbookqa": openbookqa.OpenBookQA, # not implemented yet
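For orientation, here is a minimal sketch of how a dict registry like TASK_REGISTRY is consumed. The get_task_class helper is a hypothetical illustration, not necessarily the harness's real entry point; only the TASK_REGISTRY dict itself comes from the diff above.

# Hypothetical consumer of the TASK_REGISTRY dict shown above. After this
# commit, the two ARC names resolve to real Task classes instead of
# commented-out placeholders.
from lm_eval.tasks import TASK_REGISTRY

def get_task_class(task_name):
    # The registry maps a task name ("arc_easy") to its Task class (arc.ARCEasy).
    try:
        return TASK_REGISTRY[task_name]
    except KeyError:
        raise KeyError(f"Unknown task: {task_name!r}")

arc_easy_cls = get_task_class("arc_easy")            # -> arc.ARCEasy
arc_challenge_cls = get_task_class("arc_challenge")  # -> arc.ARCChallenge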
lm_eval/tasks/arc.py

import numpy as np
from lm_eval.base import rf, mean
from .common import HFTask


class ARCEasy(HFTask):
    DATASET_PATH = "ai2_arc"
    DATASET_NAME = "ARC-Easy"

    letter_to_num = {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4}

    def __init__(self):
        super().__init__()
        self.data = self.__clean_data()

    def __clean_data(self):
        """ Resolves various edge cases in the unprocessed HF ARC dataset. """
        # NOTE: Some `doc["answerKey"]`s are in numeric string format being one
        # of {'1', '2', '3', '4', '5'}. We map them back to letters.
        num_to_letter = {'1': 'A', '2': 'B', '3': 'C', '4': 'D', '5': 'E'}
        result = {}
        for split, data in self.data.items():
            result[split] = []
            for doc in data:
                # Ensure all `answerKey`s and `label`s are in letter format.
                doc["answerKey"] = num_to_letter.get(doc["answerKey"], doc["answerKey"])
                doc["choices"]["label"] = [
                    num_to_letter.get(label, label)
                    for label in doc["choices"]["label"]
                ]
                result[split].append(doc)
        return result

    def has_training_docs(self):
        return True
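To make the cleanup concrete, here is a standalone demo of the same normalization on an invented document in the HF ai2_arc schema (the question text and choices are made up for illustration):

# Replicates __clean_data's answerKey/label normalization on one fake doc.
num_to_letter = {'1': 'A', '2': 'B', '3': 'C', '4': 'D', '5': 'E'}

doc = {
    "question": "Which gas do plants absorb from the air?",  # invented example
    "choices": {"text": ["Oxygen", "Carbon dioxide"], "label": ['1', '2']},
    "answerKey": '2',  # numeric-string form found in some raw HF docs
}

doc["answerKey"] = num_to_letter.get(doc["answerKey"], doc["answerKey"])
doc["choices"]["label"] = [num_to_letter.get(l, l) for l in doc["choices"]["label"]]

assert doc["answerKey"] == 'B'
assert doc["choices"]["label"] == ['A', 'B']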
@@ -21,7 +47,8 @@ class ARCEasy(HFTask):
         return "Question: " + doc['question'] + '\nAnswer:'

     def doc_to_target(self, doc):
-        return " " + doc['choices']['text'][doc['choices']['label'].index(doc['answerKey'])]
+        index = self.letter_to_num[doc["answerKey"]]
+        return " " + doc['choices']['text'][index]

     def construct_requests(self, doc, ctx):
         """ Uses RequestFactory to construct Requests and returns an iterable of
@@ -34,9 +61,11 @@ class ARCEasy(HFTask):
         language description, as well as the few shot examples, and the question
         part of the document for `doc`.
         """
-        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        ll_choices = []
+        for choice in doc["choices"]["text"]:
+            ll_choices.append(rf.loglikelihood(ctx, " " + choice)[0])
+        return ll_choices

     def process_results(self, doc, results):
         """Take a single document and the LM results and evaluates, returning a
         dict where keys are the names of submetrics and values are the values of
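construct_requests now emits one loglikelihood request per answer option, each scoring " <choice>" as a continuation of the shared few-shot context ctx. A sketch of the resulting shape, with an invented doc and context (rf comes from lm_eval.base, as imported above):

from lm_eval.base import rf

doc = {"choices": {"text": ["red", "green", "blue", "yellow"]}}  # invented
ctx = "Question: What color is a ripe tomato?\nAnswer:"

# One pending request per choice; [0] selects the log-likelihood component
# of each request's eventual result, matching the diff's usage.
ll_choices = [rf.loglikelihood(ctx, " " + choice)[0] for choice in doc["choices"]["text"]]
assert len(ll_choices) == len(doc["choices"]["text"])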
@@ -47,8 +76,11 @@ class ARCEasy(HFTask):
         :param results:
             The results of the requests created in construct_requests.
         """
-        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        gold = self.letter_to_num[doc["answerKey"]]
+        pred = np.argmax(results)
+        return {
+            "acc": pred == gold
+        }

     def aggregation(self):
         """
@@ -56,8 +88,9 @@ class ARCEasy(HFTask):
             A dictionary where keys are the names of submetrics and values are
             functions that aggregate a list of metrics
         """
-        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        return {
+            "acc": mean
+        }

     def higher_is_better(self):
         """
@@ -65,8 +98,10 @@ class ARCEasy(HFTask):
             A dictionary where keys are the names of submetrics and values are
             whether a higher value of the submetric is better
         """
-        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        return {
+            "acc": True
+        }


 class ARCChallenge(ARCEasy):
     DATASET_PATH = "ai2_arc"
lm_eval/tasks/arithmetic.py

@@ -32,7 +32,7 @@ class Arithmetic(Task):
         self._docs = [self.load_doc(json.loads(line)) for line in jsons]

     def has_training_docs(self):
-        return True
+        return False

     def has_validation_docs(self):
         return True
@@ -41,10 +41,10 @@ class Arithmetic(Task):
         return False

     def training_docs(self):
-        return self._docs
+        return NotImplemented

     def validation_docs(self):
-        return self._docs[:100]
+        return self._docs

     def test_docs(self):
         return NotImplemented
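The arithmetic change flips which split serves the loaded docs: the task stops advertising a training split, and validation now returns every loaded doc rather than only the first 100. A stand-in class sketching the post-commit behavior (the real class loads self._docs from JSON lines as in the context above; the doc fields below are invented):

class ArithmeticLike:
    # Minimal stand-in mirroring the post-commit split behavior.
    def __init__(self, docs):
        self._docs = docs

    def has_training_docs(self):
        return False              # was True before this commit

    def training_docs(self):
        return NotImplemented     # previously returned self._docs

    def has_validation_docs(self):
        return True

    def validation_docs(self):
        return self._docs         # previously self._docs[:100]

task = ArithmeticLike([{"context": "Question: What is 2 plus 2?\nAnswer:", "completion": " 4"}])
assert len(task.validation_docs()) == 1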