Merge branch 'master' into triviaqa_evaluation

c8032a1a · Leo Gao · GitHub · b2b5a122 · 0f30237a · c8032a1a
Unverified Commit c8032a1a authored Jan 30, 2021 by Leo Gao Committed by GitHub Jan 30, 2021
Showing with 46 additions and 17 deletions

lm_eval/tasks/__init__.py lm_eval/tasks/__init__.py +3 -1

lm_eval/tasks/lambada.py lm_eval/tasks/lambada.py +0 -1

lm_eval/tasks/race.py lm_eval/tasks/race.py +40 -15

requirements.txt requirements.txt +3 -0

No files found.
--- a/lm_eval/tasks/__init__.py
+++ b/lm_eval/tasks/__init__.py
@@ -14,9 +14,11 @@ from . import naturalqs
 from . import sat
 from . import arithmetic
 from . import lambada
+from . import race 
 from . import piqa
 from . import triviaqa

+
 TASK_REGISTRY = {
    # GLUE
    "cola": glue.CoLA,
@@ -51,7 +53,7 @@ TASK_REGISTRY = {
    # "openbookqa": openbookqa.OpenBookQA, # not implemented yet
    # "sat": sat.SATAnalogies, # not implemented yet
    # "squad": squad.SQuAD, # not implemented yet
-    # "race": race.RACE, # not implemented yet
+    "race": race.RACE,
    # "naturalqs": naturalqs.NaturalQs, # not implemented yet
    # "webqs": webqs.WebQs, # not implemented yet
    # "wsc273": wsc273.WinogradSchemaChallenge273, # not implemented yet

--- a/lm_eval/tasks/lambada.py
+++ b/lm_eval/tasks/lambada.py
 from lm_eval.base import Dataset, rf, mean
 from lm_eval.utils import sh
 import json
-import requests
 import math
 from best_download import download_file


--- a/lm_eval/tasks/race.py
+++ b/lm_eval/tasks/race.py
-from . common import HFTask
-from ..utils_stream import X, each, apply, join, filt, one
 import collections
 import datasets
+import numpy as np
+from lm_eval.base import rf, mean
+from . common import HFTask
+from ..utils_stream import each


 class RACE(HFTask):
@@ -9,6 +11,7 @@ class RACE(HFTask):
    DATASET_NAME = "high"

    cache = {}
+    letter_to_num = {'A': 0, 'B': 1, 'C': 2, 'D': 3}

    def has_training_docs(self):
        return True
@@ -54,13 +57,26 @@ class RACE(HFTask):
        # TODO: figure out description
        return ""

+    @classmethod
+    def get_answer_option(cls, problem):
+        answer = cls.letter_to_num[problem['answer']]
+        return problem['options'][answer]
+
+    @classmethod
+    def last_problem(cls, doc):
+        return doc['problems'][-1]
+
    def doc_to_text(self, doc):
-        # TODO: implement
-        pass
+        text = 'Article: ' + doc['article'] + '\n\n'
+        for problem in doc['problems'][:-1]:
+            question = 'Q: ' + problem['question'] + '\n\n'
+            answer = 'A: ' + self.get_answer_option(problem) + '\n\n'
+            text += question + answer
+        text += 'Q: ' + self.last_problem(doc)['question'] + '\n\n' + 'A:'
+        return text

    def doc_to_target(self, doc):
-        # TODO: implement
-        pass
+        return " " + self.get_answer_option(self.last_problem(doc))

    def construct_requests(self, doc, ctx):
        """ Uses RequestFactory to construct Requests and returns an iterable of 
@@ -73,9 +89,13 @@ class RACE(HFTask):
            language description, as well as the few shot examples, and the question
            part of the document for `doc`. 
        """
-        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
-    
+        problem = self.last_problem(doc)
+        ll_choices = [
+            rf.loglikelihood(ctx, " " + problem['options'][i])[0]
+            for i in range(4)
+        ]
+        return ll_choices
+
    def process_results(self, doc, results):
        """Take a single document and the LM results and evaluates, returning a 
        dict where keys are the names of submetrics and values are the values of 
@@ -86,8 +106,11 @@ class RACE(HFTask):
        :param results:
            The results of the requests created in construct_requests.
        """
-        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        gold = self.letter_to_num[self.last_problem(doc)['answer']]
+        pred = np.argmax(results)
+        return {
+            "acc": int(pred == gold)
+        }

    def aggregation(self):
        """
@@ -95,8 +118,9 @@ class RACE(HFTask):
            A dictionary where keys are the names of submetrics and values are 
            functions that aggregate a list of metrics
        """
-        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        return {
+            "acc": mean
+        }

    def higher_is_better(self):
        """
@@ -104,5 +128,6 @@ class RACE(HFTask):
            A dictionary where keys are the names of submetrics and values are 
            whether a higher value of the submetric is better
        """
-        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
\ No newline at end of file
+        return {
+            "acc": True
+        }
--- a/requirements.txt
+++ b/requirements.txt
 black==20.8b1
+best_download>=0.0.5
+datasets>=1.2.1
 click>=7.1
+scikit-learn>=0.24.1
 torch>=1.7
 transformers>=4.1
\ No newline at end of file