add asdiv task

b0a12310 · rokosbasilisk · f16e8b5c · b0a12310 · b0a12310 · b0a12310
Commit b0a12310 authored Dec 26, 2021 by rokosbasilisk
9 changed files
--- a/eval.sh
+++ b/eval.sh
+# Run main.py with --model gpt2 and pretrained=EleutherAI/gpt-neo-125M on the math_asdiv task, using device cuda:0.
+python main.py --model gpt2 --model_args pretrained=EleutherAI/gpt-neo-125M --device cuda:0 --tasks math_asdiv
--- a/lm_eval.egg-info/PKG-INFO
+++ b/lm_eval.egg-info/PKG-INFO
--- a/lm_eval.egg-info/SOURCES.txt
+++ b/lm_eval.egg-info/SOURCES.txt
+LICENSE.md
+README.md
+setup.py
+lm_eval/__init__.py
+lm_eval/base.py
+lm_eval/evaluator.py
+lm_eval/metrics.py
+lm_eval/utils.py
+lm_eval.egg-info/PKG-INFO
+lm_eval.egg-info/SOURCES.txt
+lm_eval.egg-info/dependency_links.txt
+lm_eval.egg-info/requires.txt
+lm_eval.egg-info/top_level.txt
+lm_eval/models/__init__.py
+lm_eval/models/dummy.py
+lm_eval/models/gpt2.py
+lm_eval/models/gpt3.py
+lm_eval/tasks/__init__.py
+lm_eval/tasks/anli.py
+lm_eval/tasks/arc.py
+lm_eval/tasks/arithmetic.py
+lm_eval/tasks/blimp.py
+lm_eval/tasks/cbt.py
+lm_eval/tasks/common.py
+lm_eval/tasks/coqa.py
+lm_eval/tasks/drop.py
+lm_eval/tasks/glue.py
+lm_eval/tasks/headqa.py
+lm_eval/tasks/hellaswag.py
+lm_eval/tasks/hendrycks_ethics.py
+lm_eval/tasks/hendrycks_math.py
+lm_eval/tasks/hendrycks_test.py
+lm_eval/tasks/lambada.py
+lm_eval/tasks/lambada_cloze.py
+lm_eval/tasks/lambada_multilingual.py
+lm_eval/tasks/logiqa.py
+lm_eval/tasks/mathqa.py
+lm_eval/tasks/mc_taco.py
+lm_eval/tasks/mutual.py
+lm_eval/tasks/naturalqs.py
+lm_eval/tasks/openbookqa.py
+lm_eval/tasks/pile.py
+lm_eval/tasks/piqa.py
+lm_eval/tasks/prost.py
+lm_eval/tasks/pubmedqa.py
+lm_eval/tasks/qa4mre.py
+lm_eval/tasks/quac.py
+lm_eval/tasks/race.py
+lm_eval/tasks/sat.py
+lm_eval/tasks/sciq.py
+lm_eval/tasks/squad.py
+lm_eval/tasks/storycloze.py
+lm_eval/tasks/superglue.py
+lm_eval/tasks/translation.py
+lm_eval/tasks/triviaqa.py
+lm_eval/tasks/truthfulqa.py
+lm_eval/tasks/unscramble.py
+lm_eval/tasks/webqs.py
+lm_eval/tasks/wikitext.py
+lm_eval/tasks/winogrande.py
+lm_eval/tasks/wsc273.py
+scripts/__init__.py
+scripts/cost_estimate.py
+scripts/fewshot_description_experiment.py
+scripts/get_prompts.py
+scripts/make_gpt2_test_cases.py
+scripts/make_table_tasks.py
+scripts/write_out.py
+scripts/clean_training_data/__init__.py
+scripts/clean_training_data/archiver.py
+scripts/clean_training_data/generate_13_grams.py
+scripts/clean_training_data/janitor.py
+scripts/clean_training_data/process_sorted_buckets.py
+scripts/clean_training_data/sort_13_gram_buckets.py
\ No newline at end of file
--- a/lm_eval.egg-info/dependency_links.txt
+++ b/lm_eval.egg-info/dependency_links.txt
--- a/lm_eval.egg-info/requires.txt
+++ b/lm_eval.egg-info/requires.txt
+black
+best_download>=0.0.6
+datasets==1.15.1
+click>=7.1
+scikit-learn>=0.24.1
+torch>=1.7
+transformers>=4.1
+sqlitedict==1.6.0
+pytablewriter==0.58.0
+sacrebleu==1.5.0
+rouge-score==0.0.4
+bleurt@ https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
+pycountry==20.7.3
+numexpr==2.7.2
+lm_dataformat==0.0.20
+pytest==6.2.3
+pybind11==2.6.2
+tqdm-multiprocess==0.0.11
+zstandard==0.15.2
+jsonlines==2.0.0
+mock==4.0.3
+openai==0.6.4
+jieba==0.42.1
+nagisa==0.2.7
--- a/lm_eval.egg-info/top_level.txt
+++ b/lm_eval.egg-info/top_level.txt
+lm_eval
+scripts
--- a/lm_eval/tasks/__init__.py
+++ b/lm_eval/tasks/__init__.py
@@ -45,6 +45,7 @@ from . import lambada_multilingual
 from . import mutual
 from . import truthfulqa
 from . import blimp
+from . import asdiv
 ########################################
 # Translation tasks
@@ -164,6 +165,7 @@ TASK_REGISTRY = {
    "math_num_theory": hendrycks_math.MathNumberTheory,
    "math_prealgebra": hendrycks_math.MathPrealgebra,
    "math_precalc": hendrycks_math.MathPrecalculus,
+    "math_asdiv": asdiv.Asdiv,
    # arithmetic
    "arithmetic_2da": arithmetic.Arithmetic2DPlus,

--- a/lm_eval/tasks/apps.py
+++ b/lm_eval/tasks/apps.py
+"""
+Measuring Coding Challenge Competence With APPS
+https://arxiv.org/pdf/2105.09938
+@article{hendrycksapps2021,
+  title={Measuring Coding Challenge Competence With APPS},
+  author={Dan Hendrycks and Steven Basart and Saurav Kadavath and Mantas Mazeika and Akul Arora and Ethan Guo and Collin Burns and Samir Puranik and Horace He and Dawn Song and Jacob Steinhardt},
+  journal={NeurIPS},
+  year={2021}
+}
+"""
+from lm_eval.base import Task
+from pathlib import Path
+from best_download import download_file 
+import xml.etree.ElementTree as ET
+from lm_eval.base import rf
+from lm_eval.metrics import mean,perplexity
+import numpy as np
+from zipfile import ZipFile
+import os 
+class Apps(Task):
+    """Task class defined in apps.py.
+
+    NOTE(review): the module docstring describes the APPS coding benchmark,
+    but every method below works on ASDiv-style XML (``Problem`` elements with
+    ``Question``/``Body``/``Answer`` children) stored under ``data/asdiv``.
+    This looks like an unfinished copy of the ASDiv task - confirm the
+    intended behavior before relying on it.
+    """
+    VERSION = 0
+    # NOTE(review): same directory as the Asdiv task in asdiv.py, so the exists() check in download() cannot tell the two datasets apart.
+    DATASET_PATH = Path("data/asdiv")
+    def download(self):
+        """Fetch the archive into DATASET_PATH unless that directory already exists."""
+        if self.DATASET_PATH.exists():
+            return
+        # Path.mkdir without parents=True: raises if the parent "data" directory does not exist.
+        Path.mkdir(self.DATASET_PATH)
+        # NOTE(review): the URL names a .tar.gz archive, yet it is saved as master.zip and opened with ZipFile below - ZipFile cannot read a gzip tarball; confirm.
+        url = "https://people.eecs.berkeley.edu/~hendrycks/APPS.tar.gz"
+        # NOTE(review): this is the same checksum string used for the ASDiv archive in asdiv.py - presumably wrong for this URL; verify.
+        checksum = "2f71f8003929d605369ad924be4b95c15879fc2bfac0d4d01a81f8aabceaad5c"
+        zip_path = self.DATASET_PATH / "master.zip"
+        download_file(url, str(zip_path), checksum)
+        # NOTE(review): the name `zip` shadows the builtin inside this block.
+        with ZipFile(zip_path, "r") as zip:
+            zip.extractall(self.DATASET_PATH)
+        # The downloaded archive is deleted once its contents are extracted.
+        os.remove(zip_path)
+    def _convert_standard(self, problem):
+        """Build a doc dict from the text of the Question, Body and Answer children of `problem`."""
+        #TODO: include solution-type and formula
+        out_doc = {
+            "question" : problem.find('Question').text,
+            "body" : problem.find('Body').text,
+            "answer": problem.find('Answer').text
+        }
+        return out_doc
+    def load_docs(self, textfilename, tfds=False):
+        """Parse the XML file and yield one doc dict per 'Problem' element.
+
+        `tfds` is accepted but never read in this method.
+        """
+        tree = ET.parse(textfilename)
+        root = tree.getroot()
+        # `pid` is not used; only the element itself is converted.
+        for pid, problem in enumerate(root.iter('Problem')):
+            out_doc = self._convert_standard(problem)
+            yield out_doc
+    def _strip_bracket(self,test_str):
+        """Return `test_str` with every character inside (possibly nested) parentheses removed.
+
+        The parentheses themselves are dropped too; a ')' with no open '(' is kept.
+        """
+        ret = ''
+        # skip1c is never modified, so the `skip1c == 0` test below is always true.
+        skip1c = 0
+        # skip2c counts the currently open '(' characters.
+        skip2c = 0
+        for i in test_str:
+            if i == '(':
+                skip2c += 1
+            elif i == ')'and skip2c > 0:
+                skip2c -= 1
+            elif skip1c == 0 and skip2c == 0:
+                ret += i
+        return ret
+    def has_training_docs(self):
+        return False
+    def has_validation_docs(self):
+        return True
+    def has_test_docs(self):
+        return False
+    def training_docs(self):
+        raise NotImplementedError("This dataset has no training docs")
+    def test_docs(self):
+        raise NotImplementedError("This dataset has no test docs")
+    def validation_docs(self):
+        """Return a generator over the docs read from the ASDiv XML file under DATASET_PATH."""
+        # NOTE(review): this path belongs to the ASDiv repository layout, not to the APPS archive named in download().
+        data_xml_path = self.DATASET_PATH / "nlu-asdiv-dataset-master/dataset/ASDiv.xml"
+        return self.load_docs(data_xml_path)
+    def fewshot_context(self, doc, num_fewshot, provide_description, rnd):
+        """Delegate to the parent implementation after requiring num_fewshot == 0."""
+        assert num_fewshot == 0, "ASDiv is intended only for the zero-shot setting."
+        return super().fewshot_context(doc, num_fewshot, provide_description, rnd)
+    def fewshot_description(self):
+        # TODO: add solution-type and formula
+        desc = "information containing the context of the question\nQuestion: Text of a question.\nAnswer: Answer to the question, based on the passage.\n"
+        return desc
+    def doc_to_text(self, doc):
+        """Format the prompt as body, then 'Question:' + question, then a trailing 'Answer:'."""
+        # TODO: add solution-type
+        return doc['body'] + '\n' + 'Question:' + doc['question'] + '\n' + 'Answer:'
+    def doc_to_target(self, doc):
+        """Return the answer with bracketed text removed, or the raw answer if nothing remains."""
+        # TODO: add formula
+        answer = self._strip_bracket(doc['answer'])
+        if len(answer)>0: # check if answer is present only in brackets
+            return answer
+        else:
+            return doc['answer']
+    def construct_requests(self, doc, ctx):
+        """Issue one loglikelihood request for the target given `ctx` and return its two results."""
+        ll, is_greedy = rf.loglikelihood(ctx, self.doc_to_target(doc))
+        return ll, is_greedy
+    def process_results(self, doc, results):
+        """Score a doc: 'acc' is 1 when `is_greedy` is truthy, else 0; `ll` is not used."""
+        ll, is_greedy = results
+        return {
+            'acc': int(is_greedy)
+        }
+    def aggregation(self):
+        return {
+            'acc': mean
+        }
+    def higher_is_better(self):
+        return {
+            'acc': True
+        }
--- a/lm_eval/tasks/asdiv.py
+++ b/lm_eval/tasks/asdiv.py
+"""
+ASDiv: A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers
+https://arxiv.org/abs/2106.15772
+@misc{miao2021diverse,
+      title={A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers},
+      author={Shen-Yun Miao and Chao-Chun Liang and Keh-Yih Su},
+      year={2021},
+      eprint={2106.15772},
+      archivePrefix={arXiv},
+      primaryClass={cs.AI}
+}
+"""
+from lm_eval.base import Task
+from pathlib import Path
+from best_download import download_file 
+import xml.etree.ElementTree as ET
+from lm_eval.base import rf
+from lm_eval.metrics import mean,perplexity
+import numpy as np
+from zipfile import ZipFile
+import os 
+# NOTE: the formula is currently ignored when generating the answer.
+# given a subset, splits return the docs 
+class Asdiv(Task):
+    """Zero-shot task over the ASDiv math word problem corpus.
+
+    Documents are dicts with the keys ``question``, ``body`` and ``answer``,
+    read from the ``Problem`` elements of ``ASDiv.xml``. Only a validation
+    split is exposed. Each doc is scored with a single loglikelihood request
+    and ``acc`` is derived from its ``is_greedy`` result.
+    """
+    VERSION = 0
+    DATASET_PATH = Path("data/asdiv")
+    def _xml_path(self):
+        """Return the location of ASDiv.xml inside the extracted archive."""
+        return self.DATASET_PATH / "nlu-asdiv-dataset-master/dataset/ASDiv.xml"
+    def download(self):
+        """Download and extract the ASDiv archive unless ASDiv.xml is already present.
+
+        The check is made on the extracted XML file rather than on the
+        directory, so a run that was interrupted after the directory was
+        created is retried instead of leaving the task without data.
+        """
+        if self._xml_path().exists():
+            return
+        # parents=True: the parent "data" directory may not exist yet on a fresh checkout.
+        # exist_ok=True: the directory may be left over from an earlier, incomplete download.
+        self.DATASET_PATH.mkdir(parents=True, exist_ok=True)
+        # NOTE(review): the URL tracks the master branch while the checksum is fixed - presumably download_file verifies it, so a new upstream commit would break the download; consider pinning a commit.
+        url = "https://github.com/chaochun/nlu-asdiv-dataset/archive/refs/heads/master.zip"
+        checksum = "2f71f8003929d605369ad924be4b95c15879fc2bfac0d4d01a81f8aabceaad5c"
+        zip_path = self.DATASET_PATH / "master.zip"
+        download_file(url, str(zip_path), checksum)
+        # `archive` instead of `zip`, to avoid shadowing the builtin.
+        with ZipFile(zip_path, "r") as archive:
+            archive.extractall(self.DATASET_PATH)
+        # The archive is no longer needed once extracted.
+        os.remove(zip_path)
+    def _convert_standard(self, problem):
+        """Build a doc dict from the Question, Body and Answer children of `problem`."""
+        #TODO: include solution-type and formula
+        return {
+            "question": problem.find('Question').text,
+            "body": problem.find('Body').text,
+            "answer": problem.find('Answer').text,
+        }
+    def load_docs(self, textfilename, tfds=False):
+        """Parse the XML file and yield one doc dict per 'Problem' element.
+
+        `tfds` is unused; it is kept so existing callers keep working.
+        """
+        root = ET.parse(textfilename).getroot()
+        for problem in root.iter('Problem'):
+            yield self._convert_standard(problem)
+    def _strip_bracket(self,test_str):
+        """Return `test_str` without parenthesised text.
+
+        Characters inside (possibly nested) parentheses are dropped together
+        with the parentheses. A ')' that closes nothing is kept, and an
+        unclosed '(' drops the rest of the string.
+        """
+        kept = []
+        depth = 0  # number of currently open '('
+        for ch in test_str:
+            if ch == '(':
+                depth += 1
+            elif ch == ')' and depth > 0:
+                depth -= 1
+            elif depth == 0:
+                kept.append(ch)
+        return ''.join(kept)
+    def has_training_docs(self):
+        return False
+    def has_validation_docs(self):
+        return True
+    def has_test_docs(self):
+        return False
+    def training_docs(self):
+        raise NotImplementedError("This dataset has no training docs")
+    def test_docs(self):
+        raise NotImplementedError("This dataset has no test docs")
+    def validation_docs(self):
+        """Return a generator over all docs in ASDiv.xml."""
+        return self.load_docs(self._xml_path())
+    def fewshot_context(self, doc, num_fewshot, provide_description, rnd):
+        """Delegate to the parent implementation after requiring num_fewshot == 0."""
+        assert num_fewshot == 0, "ASDiv is intended only for the zero-shot setting."
+        return super().fewshot_context(doc, num_fewshot, provide_description, rnd)
+    def fewshot_description(self):
+        # TODO: add solution-type and formula
+        desc = "information containing the context of the question\nQuestion: Text of a question.\nAnswer: Answer to the question, based on the passage.\n"
+        return desc
+    def doc_to_text(self, doc):
+        """Format the prompt as body, then 'Question:' + question, then a trailing 'Answer:'."""
+        # TODO: add solution-type
+        return doc['body'] + '\n' + 'Question:' + doc['question'] + '\n' + 'Answer:'
+    def doc_to_target(self, doc):
+        """Return the answer without its bracketed part.
+
+        Removing a bracketed suffix such as "7 (apples)" leaves "7 " behind,
+        so surrounding whitespace is stripped as well. If nothing remains
+        outside the brackets, the raw answer text is returned unchanged.
+        """
+        # TODO: add formula
+        # NOTE(review): the prompt ends with 'Answer:' and the target has no leading space - confirm whether the harness expects one.
+        answer = self._strip_bracket(doc['answer']).strip()
+        if answer:
+            return answer
+        return doc['answer']
+    def construct_requests(self, doc, ctx):
+        """Issue one loglikelihood request for the target given `ctx` and return its two results."""
+        ll, is_greedy = rf.loglikelihood(ctx, self.doc_to_target(doc))
+        return ll, is_greedy
+    def process_results(self, doc, results):
+        """Score a doc: 'acc' is 1 when `is_greedy` is truthy, else 0."""
+        _, is_greedy = results
+        return {
+            'acc': int(is_greedy)
+        }
+    def aggregation(self):
+        return {
+            'acc': mean
+        }
+    def higher_is_better(self):
+        return {
+            'acc': True
+        }