Commit b0a12310 authored by rokosbasilisk's avatar rokosbasilisk
Browse files

add asdiv task

parent f16e8b5c
python main.py --model gpt2 --model_args pretrained=EleutherAI/gpt-neo-125M --device cuda:0 --tasks math_asdiv
This diff is collapsed.
LICENSE.md
README.md
setup.py
lm_eval/__init__.py
lm_eval/base.py
lm_eval/evaluator.py
lm_eval/metrics.py
lm_eval/utils.py
lm_eval.egg-info/PKG-INFO
lm_eval.egg-info/SOURCES.txt
lm_eval.egg-info/dependency_links.txt
lm_eval.egg-info/requires.txt
lm_eval.egg-info/top_level.txt
lm_eval/models/__init__.py
lm_eval/models/dummy.py
lm_eval/models/gpt2.py
lm_eval/models/gpt3.py
lm_eval/tasks/__init__.py
lm_eval/tasks/anli.py
lm_eval/tasks/arc.py
lm_eval/tasks/arithmetic.py
lm_eval/tasks/blimp.py
lm_eval/tasks/cbt.py
lm_eval/tasks/common.py
lm_eval/tasks/coqa.py
lm_eval/tasks/drop.py
lm_eval/tasks/glue.py
lm_eval/tasks/headqa.py
lm_eval/tasks/hellaswag.py
lm_eval/tasks/hendrycks_ethics.py
lm_eval/tasks/hendrycks_math.py
lm_eval/tasks/hendrycks_test.py
lm_eval/tasks/lambada.py
lm_eval/tasks/lambada_cloze.py
lm_eval/tasks/lambada_multilingual.py
lm_eval/tasks/logiqa.py
lm_eval/tasks/mathqa.py
lm_eval/tasks/mc_taco.py
lm_eval/tasks/mutual.py
lm_eval/tasks/naturalqs.py
lm_eval/tasks/openbookqa.py
lm_eval/tasks/pile.py
lm_eval/tasks/piqa.py
lm_eval/tasks/prost.py
lm_eval/tasks/pubmedqa.py
lm_eval/tasks/qa4mre.py
lm_eval/tasks/quac.py
lm_eval/tasks/race.py
lm_eval/tasks/sat.py
lm_eval/tasks/sciq.py
lm_eval/tasks/squad.py
lm_eval/tasks/storycloze.py
lm_eval/tasks/superglue.py
lm_eval/tasks/translation.py
lm_eval/tasks/triviaqa.py
lm_eval/tasks/truthfulqa.py
lm_eval/tasks/unscramble.py
lm_eval/tasks/webqs.py
lm_eval/tasks/wikitext.py
lm_eval/tasks/winogrande.py
lm_eval/tasks/wsc273.py
scripts/__init__.py
scripts/cost_estimate.py
scripts/fewshot_description_experiment.py
scripts/get_prompts.py
scripts/make_gpt2_test_cases.py
scripts/make_table_tasks.py
scripts/write_out.py
scripts/clean_training_data/__init__.py
scripts/clean_training_data/archiver.py
scripts/clean_training_data/generate_13_grams.py
scripts/clean_training_data/janitor.py
scripts/clean_training_data/process_sorted_buckets.py
scripts/clean_training_data/sort_13_gram_buckets.py
\ No newline at end of file
black
best_download>=0.0.6
datasets==1.15.1
click>=7.1
scikit-learn>=0.24.1
torch>=1.7
transformers>=4.1
sqlitedict==1.6.0
pytablewriter==0.58.0
sacrebleu==1.5.0
rouge-score==0.0.4
bleurt@ https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
pycountry==20.7.3
numexpr==2.7.2
lm_dataformat==0.0.20
pytest==6.2.3
pybind11==2.6.2
tqdm-multiprocess==0.0.11
zstandard==0.15.2
jsonlines==2.0.0
mock==4.0.3
openai==0.6.4
jieba==0.42.1
nagisa==0.2.7
...@@ -45,6 +45,7 @@ from . import lambada_multilingual ...@@ -45,6 +45,7 @@ from . import lambada_multilingual
from . import mutual from . import mutual
from . import truthfulqa from . import truthfulqa
from . import blimp from . import blimp
from . import asdiv
######################################## ########################################
# Translation tasks # Translation tasks
...@@ -164,6 +165,7 @@ TASK_REGISTRY = { ...@@ -164,6 +165,7 @@ TASK_REGISTRY = {
"math_num_theory": hendrycks_math.MathNumberTheory, "math_num_theory": hendrycks_math.MathNumberTheory,
"math_prealgebra": hendrycks_math.MathPrealgebra, "math_prealgebra": hendrycks_math.MathPrealgebra,
"math_precalc": hendrycks_math.MathPrecalculus, "math_precalc": hendrycks_math.MathPrecalculus,
"math_asdiv": asdiv.Asdiv,
# arithmetic # arithmetic
"arithmetic_2da": arithmetic.Arithmetic2DPlus, "arithmetic_2da": arithmetic.Arithmetic2DPlus,
......
"""
Measuring Coding Challenge Competence With APPS
https://arxiv.org/pdf/2105.09938
@article{hendrycksapps2021,
title={Measuring Coding Challenge Competence With APPS},
author={Dan Hendrycks and Steven Basart and Saurav Kadavath and Mantas Mazeika and Akul Arora and Ethan Guo and Collin Burns and Samir Puranik and Horace He and Dawn Song and Jacob Steinhardt},
journal={NeurIPS},
year={2021}
}
"""
from lm_eval.base import Task
from pathlib import Path
from best_download import download_file
import xml.etree.ElementTree as ET
from lm_eval.base import rf
from lm_eval.metrics import mean,perplexity
import numpy as np
from zipfile import ZipFile
import os
class Apps(Task):
    """Zero-shot question-answering task scored by greedy-match accuracy.

    Documents are dicts with "body", "question" and "answer" strings parsed
    from an XML file of ``Problem`` elements. Only a validation split is
    exposed.

    NOTE(review): the download URL points at the APPS tarball, but the archive
    is opened with ``ZipFile`` and the documents are read from an ASDiv XML
    path under ``data/asdiv``. This class looks like an unfinished copy of the
    ASDiv task; confirm the intended URL, checksum and paths before relying
    on it.
    """

    VERSION = 0
    DATASET_PATH = Path("data/asdiv")

    def download(self):
        """Fetch and unpack the dataset archive unless DATASET_PATH exists."""
        if self.DATASET_PATH.exists():
            return
        # parents=True: a bare mkdir raises FileNotFoundError when the parent
        # "data" directory has not been created yet.
        self.DATASET_PATH.mkdir(parents=True, exist_ok=True)
        # NOTE(review): this is a .tar.gz URL, yet the file is saved as
        # "master.zip" and opened with ZipFile below; the checksum is the same
        # value used for the ASDiv archive. Verify all three.
        url = "https://people.eecs.berkeley.edu/~hendrycks/APPS.tar.gz"
        checksum = "2f71f8003929d605369ad924be4b95c15879fc2bfac0d4d01a81f8aabceaad5c"
        zip_path = self.DATASET_PATH / "master.zip"
        download_file(url, str(zip_path), checksum)
        with ZipFile(zip_path, "r") as archive:
            archive.extractall(self.DATASET_PATH)
        # The archive is no longer needed once its contents are extracted.
        os.remove(zip_path)

    def _convert_standard(self, problem):
        """Turn one ``Problem`` XML element into a plain document dict."""
        # TODO: include solution-type and formula
        return {
            "question": problem.find('Question').text,
            "body": problem.find('Body').text,
            "answer": problem.find('Answer').text,
        }

    def load_docs(self, textfilename, tfds=False):
        """Yield one document dict per ``Problem`` element in the XML file.

        ``tfds`` is accepted for interface compatibility and is not used.
        """
        root = ET.parse(textfilename).getroot()
        for problem in root.iter('Problem'):
            yield self._convert_standard(problem)

    def _strip_bracket(self, test_str):
        """Return ``test_str`` with every parenthesised span removed.

        Nested parentheses are tracked by depth. A ')' that has no matching
        '(' is kept as an ordinary character.
        """
        kept = []
        depth = 0
        for char in test_str:
            if char == '(':
                depth += 1
            elif char == ')' and depth > 0:
                depth -= 1
            elif depth == 0:
                kept.append(char)
        return ''.join(kept)

    def has_training_docs(self):
        return False

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        raise NotImplementedError("This dataset has no training docs")

    def test_docs(self):
        raise NotImplementedError("This dataset has no test docs")

    def validation_docs(self):
        """Return a generator over the documents in the extracted XML file."""
        # NOTE(review): this is the ASDiv repository layout; confirm it is the
        # intended source for this task.
        data_xml_path = self.DATASET_PATH / "nlu-asdiv-dataset-master/dataset/ASDiv.xml"
        return self.load_docs(data_xml_path)

    def fewshot_context(self, doc, num_fewshot, provide_description, rnd):
        """Build the prompt context; only ``num_fewshot == 0`` is accepted."""
        assert num_fewshot == 0, "ASDiv is intended only for the zero-shot setting."
        return super().fewshot_context(doc, num_fewshot, provide_description, rnd)

    def fewshot_description(self):
        """Return the fixed textual description of the prompt layout."""
        # TODO: add solution-type and formula
        desc = "information containing the context of the question\nQuestion: Text of a question.\nAnswer: Answer to the question, based on the passage.\n"
        return desc

    def doc_to_text(self, doc):
        """Format a document as ``body``, then the question, then "Answer:"."""
        # TODO: add solution-type
        return doc['body'] + '\n' + 'Question:' + doc['question'] + '\n' + 'Answer:'

    def doc_to_target(self, doc):
        """Return the answer with bracketed parts removed.

        Falls back to the raw answer when stripping leaves nothing, i.e. when
        the whole answer sits inside brackets.
        """
        # TODO: add formula
        # NOTE(review): whitespace that preceded a removed bracket is kept, so
        # the target may end with a space; confirm that is intended.
        answer = self._strip_bracket(doc['answer'])
        return answer or doc['answer']

    def construct_requests(self, doc, ctx):
        """Request the log-likelihood of the target given the context."""
        ll, is_greedy = rf.loglikelihood(ctx, self.doc_to_target(doc))
        return ll, is_greedy

    def process_results(self, doc, results):
        """Score a document as 1 when ``is_greedy`` is truthy, else 0."""
        _ll, is_greedy = results
        return {
            'acc': int(is_greedy)
        }

    def aggregation(self):
        return {
            'acc': mean
        }

    def higher_is_better(self):
        return {
            'acc': True
        }
"""
ASDiv: A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers
https://arxiv.org/abs/2106.15772
@misc{miao2021diverse,
title={A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers},
author={Shen-Yun Miao and Chao-Chun Liang and Keh-Yih Su},
year={2021},
eprint={2106.15772},
archivePrefix={arXiv},
primaryClass={cs.AI}
}
"""
from lm_eval.base import Task
from pathlib import Path
from best_download import download_file
import xml.etree.ElementTree as ET
from lm_eval.base import rf
from lm_eval.metrics import mean,perplexity
import numpy as np
from zipfile import ZipFile
import os
# Currently ignoring the formula for answer generation.
# given a subset, splits return the docs
class Asdiv(Task):
    """Zero-shot ASDiv math word problem task scored by greedy-match accuracy.

    Documents are dicts with "body", "question" and "answer" strings parsed
    from the ``Problem`` elements of the ASDiv XML file. Only a validation
    split is exposed.
    """

    VERSION = 0
    DATASET_PATH = Path("data/asdiv")

    def download(self):
        """Fetch and unpack the ASDiv archive unless DATASET_PATH exists."""
        if self.DATASET_PATH.exists():
            return
        # parents=True: a bare mkdir raises FileNotFoundError when the parent
        # "data" directory has not been created yet.
        self.DATASET_PATH.mkdir(parents=True, exist_ok=True)
        url = "https://github.com/chaochun/nlu-asdiv-dataset/archive/refs/heads/master.zip"
        checksum = "2f71f8003929d605369ad924be4b95c15879fc2bfac0d4d01a81f8aabceaad5c"
        zip_path = self.DATASET_PATH / "master.zip"
        download_file(url, str(zip_path), checksum)
        with ZipFile(zip_path, "r") as archive:
            archive.extractall(self.DATASET_PATH)
        # The archive is no longer needed once its contents are extracted.
        os.remove(zip_path)

    def _convert_standard(self, problem):
        """Turn one ``Problem`` XML element into a plain document dict."""
        # TODO: include solution-type and formula
        return {
            "question": problem.find('Question').text,
            "body": problem.find('Body').text,
            "answer": problem.find('Answer').text,
        }

    def load_docs(self, textfilename, tfds=False):
        """Yield one document dict per ``Problem`` element in the XML file.

        ``tfds`` is accepted for interface compatibility and is not used.
        """
        root = ET.parse(textfilename).getroot()
        for problem in root.iter('Problem'):
            yield self._convert_standard(problem)

    def _strip_bracket(self, test_str):
        """Return ``test_str`` with every parenthesised span removed.

        Nested parentheses are tracked by depth. A ')' that has no matching
        '(' is kept as an ordinary character.
        """
        kept = []
        depth = 0
        for char in test_str:
            if char == '(':
                depth += 1
            elif char == ')' and depth > 0:
                depth -= 1
            elif depth == 0:
                kept.append(char)
        return ''.join(kept)

    def has_training_docs(self):
        return False

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        raise NotImplementedError("This dataset has no training docs")

    def test_docs(self):
        raise NotImplementedError("This dataset has no test docs")

    def validation_docs(self):
        """Return a generator over the documents in the extracted XML file."""
        data_xml_path = self.DATASET_PATH / "nlu-asdiv-dataset-master/dataset/ASDiv.xml"
        return self.load_docs(data_xml_path)

    def fewshot_context(self, doc, num_fewshot, provide_description, rnd):
        """Build the prompt context; only ``num_fewshot == 0`` is accepted."""
        assert num_fewshot == 0, "ASDiv is intended only for the zero-shot setting."
        return super().fewshot_context(doc, num_fewshot, provide_description, rnd)

    def fewshot_description(self):
        """Return the fixed textual description of the prompt layout."""
        # TODO: add solution-type and formula
        desc = "information containing the context of the question\nQuestion: Text of a question.\nAnswer: Answer to the question, based on the passage.\n"
        return desc

    def doc_to_text(self, doc):
        """Format a document as ``body``, then the question, then "Answer:"."""
        # TODO: add solution-type
        return doc['body'] + '\n' + 'Question:' + doc['question'] + '\n' + 'Answer:'

    def doc_to_target(self, doc):
        """Return the answer with bracketed parts removed.

        Falls back to the raw answer when stripping leaves nothing, i.e. when
        the whole answer sits inside brackets.
        """
        # TODO: add formula
        # NOTE(review): whitespace that preceded a removed bracket is kept, so
        # the target may end with a space; confirm that is intended.
        answer = self._strip_bracket(doc['answer'])
        return answer or doc['answer']

    def construct_requests(self, doc, ctx):
        """Request the log-likelihood of the target given the context."""
        ll, is_greedy = rf.loglikelihood(ctx, self.doc_to_target(doc))
        return ll, is_greedy

    def process_results(self, doc, results):
        """Score a document as 1 when ``is_greedy`` is truthy, else 0."""
        _ll, is_greedy = results
        return {
            'acc': int(is_greedy)
        }

    def aggregation(self):
        return {
            'acc': mean
        }

    def higher_is_better(self):
        return {
            'acc': True
        }
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment