Merge pull request #143 from EleutherAI/add_headqa_mathqa

add headqa and mathqa datasets

Merge pull request #143 from EleutherAI/add_headqa_mathqa
add headqa and mathqa datasets
d194d65c · Leo Gao · GitHub · e5a9de80 · cc27e38a · d194d65c
Unverified Commit d194d65c authored Feb 12, 2021 by Leo Gao Committed by GitHub Feb 12, 2021
Hide whitespace changes
Inline Side-by-side

Showing with 119 additions and 1 deletion

lm_eval/tasks/__init__.py lm_eval/tasks/__init__.py +4 -1

lm_eval/tasks/headqa.py lm_eval/tasks/headqa.py +64 -0

lm_eval/tasks/mathqa.py lm_eval/tasks/mathqa.py +51 -0

No files found.
--- a/lm_eval/tasks/__init__.py
+++ b/lm_eval/tasks/__init__.py
@@ -21,7 +21,8 @@ from . import pubmedqa
 from . import sciq
 from . import webqs
 from . import qa4mre
-
+from . import headqa
+from . import mathqa

 TASK_REGISTRY = {
    # GLUE
@@ -67,6 +68,8 @@ TASK_REGISTRY = {
    # "squad": squad.SQuAD, # not implemented yet
    "race": race.RACE,
    # "naturalqs": naturalqs.NaturalQs, # not implemented yet
+    "headqa": headqa.HeadQA,
+    "mathqa": mathqa.MathQA,
    "webqs": webqs.WebQs,
    "wsc273": wsc273.WinogradSchemaChallenge273,
    "winogrande": winogrande.Winogrande,

--- a/lm_eval/tasks/headqa.py
+++ b/lm_eval/tasks/headqa.py
+from . common import HFTask
+from lm_eval.base import mean, rf
+
+class HeadQA(HFTask):
+    """HEAD-QA evaluated as a multiple-choice task.
+
+    Each candidate option is scored by the log-likelihood the model assigns
+    to it as a continuation of the prompt; a document counts as correct
+    when the highest-scoring option is the gold one.
+
+    NOTE(review): this relies on the published ``head_qa`` schema, where
+    ``doc['answers']`` lists every candidate option as ``{'aid', 'atext'}``
+    and ``doc['ra']`` holds the ``aid`` of the right one -- confirm against
+    the dataset actually loaded.
+    """
+
+    DATASET_PATH = "head_qa"
+    # NOTE(review): head_qa is published with "es" and "en" configurations,
+    # while the prompt below is English. None defers to the loader's
+    # default -- confirm which language is intended.
+    DATASET_NAME = None
+
+    def has_training_docs(self):
+        return True
+
+    def has_validation_docs(self):
+        return True
+
+    def has_test_docs(self):
+        return True
+
+    def fewshot_description(self):
+        # TODO: figure out description
+        return ""
+
+    def doc_to_text(self, doc):
+        """Build the prompt shown to the model for one question."""
+        return "Question: " + doc['qtext'] + '\nAnswer:'
+
+    def _gold_index(self, doc):
+        """Return the position in ``doc['answers']`` of the correct option.
+
+        The correct option is the one whose ``aid`` equals ``doc['ra']``.
+
+        Raises:
+            ValueError: if no option carries the gold answer id.
+        """
+        gold_id = int(doc['ra'])
+        for position, option in enumerate(doc['answers']):
+            if int(option['aid']) == gold_id:
+                return position
+        raise ValueError(
+            "head_qa document has no answer option with aid == ra ({})".format(gold_id)
+        )
+
+    def doc_to_target(self, doc):
+        """Return the gold answer text, space-prefixed to follow 'Answer:'.
+
+        Previously this returned ``answers[0]``, which is merely the first
+        candidate option and usually not the correct one.
+        """
+        return " " + doc['answers'][self._gold_index(doc)]['atext']
+
+    def construct_requests(self, doc, ctx):
+        """Request one log-likelihood per answer option, in dataset order.
+
+        Order is preserved (no sorting or de-duplication) so that the
+        positions in the results line up with ``_gold_index``.
+        """
+        ret = []
+        for option in doc['answers']:
+            # the first element of the request is taken to be the
+            # log-likelihood of the continuation; the second (unused here)
+            # is the greedy-match flag
+            loglik, _ = rf.loglikelihood(ctx, " " + option['atext'])
+            ret.append(loglik)
+        return ret
+
+    def process_results(self, doc, results):
+        """Score one document.
+
+        ``results`` holds one score per option, in the order produced by
+        ``construct_requests``. Accuracy is 1.0 when the best-scoring option
+        is the gold one, else 0.0.
+        """
+        gold = self._gold_index(doc)
+        # argmax over the per-option scores; ties resolve to the earliest option
+        predicted = max(range(len(results)), key=lambda i: results[i])
+        return {
+            "acc": float(predicted == gold)
+        }
+
+    def aggregation(self):
+        return {
+            "acc": mean,
+        }
+
+    def higher_is_better(self):
+        return {
+            "acc": True
+        }
\ No newline at end of file
--- a/lm_eval/tasks/mathqa.py
+++ b/lm_eval/tasks/mathqa.py
+from . common import HFTask
+from lm_eval.base import mean, rf, MultipleChoiceTask
+import re
+
+class MathQA(HFTask, MultipleChoiceTask):
+    """MathQA task: raw records are reshaped into query/choices/gold docs."""
+
+    DATASET_PATH = "math_qa"
+    DATASET_NAME = None
+
+    def has_training_docs(self):
+        return True
+
+    def has_validation_docs(self):
+        return True
+
+    def has_test_docs(self):
+        return True
+
+    def _convert_standard(self, doc):
+        """Turn one raw record into a dict with query, choices and gold.
+
+        ``doc['correct']`` must be one of the letters a-e; its position in
+        that sequence becomes ``gold``. ``doc['options']`` is split into
+        per-letter chunks by the regex below.
+        """
+        gold_position = ['a', 'b', 'c', 'd', 'e'].index(doc['correct'])
+
+        labelled_chunks = re.findall(r"[abcd] \) .*?, |e \) .*?$", doc['options'])
+        option_texts = []
+        for chunk in labelled_chunks:
+            # drop the 4-character "x ) " label, then any trailing " ," separator
+            option_texts.append(chunk[4:].rstrip(" ,"))
+
+        return {
+            "query": "Question: " + doc['Problem'] + "\nAnswer:",
+            "choices": option_texts,
+            "gold": gold_position,
+        }
+
+    def _load_docs(self, docs):
+        """Lazily yield the converted form of every record in ``docs``."""
+        yield from map(self._convert_standard, docs)
+
+    def training_docs(self):
+        return self._load_docs(super().training_docs())
+
+    def validation_docs(self):
+        return self._load_docs(super().validation_docs())
+
+    def test_docs(self):
+        return self._load_docs(super().test_docs())
+
+    def fewshot_description(self):
+        # TODO: figure out description
+        return ""
+
+    def doc_to_text(self, doc):
+        """Return the prompt that ``_convert_standard`` stored under "query"."""
+        return doc["query"]