""" Measuring Coding Challenge Competence With APPS https://arxiv.org/pdf/2105.09938 @article{hendrycksapps2021, title={Measuring Coding Challenge Competence With APPS}, author={Dan Hendrycks and Steven Basart and Saurav Kadavath and Mantas Mazeika and Akul Arora and Ethan Guo and Collin Burns and Samir Puranik and Horace He and Dawn Song and Jacob Steinhardt}, journal={NeurIPS}, year={2021} } """ from lm_eval.base import Task from pathlib import Path from best_download import download_file import xml.etree.ElementTree as ET from lm_eval.base import rf from lm_eval.metrics import mean,perplexity import numpy as np from zipfile import ZipFile import os class Apps(Task): VERSION = 0 DATASET_PATH = Path("data/asdiv") def download(self): if self.DATASET_PATH.exists(): return Path.mkdir(self.DATASET_PATH) url = "https://people.eecs.berkeley.edu/~hendrycks/APPS.tar.gz" checksum = "2f71f8003929d605369ad924be4b95c15879fc2bfac0d4d01a81f8aabceaad5c" zip_path = self.DATASET_PATH / "master.zip" download_file(url, str(zip_path), checksum) with ZipFile(zip_path, "r") as zip: zip.extractall(self.DATASET_PATH) os.remove(zip_path) def _convert_standard(self, problem): #TODO: include solution-type and formula out_doc = { "question" : problem.find('Question').text, "body" : problem.find('Body').text, "answer": problem.find('Answer').text } return out_doc def load_docs(self, textfilename, tfds=False): tree = ET.parse(textfilename) root = tree.getroot() for pid, problem in enumerate(root.iter('Problem')): out_doc = self._convert_standard(problem) yield out_doc def _strip_bracket(self,test_str): ret = '' skip1c = 0 skip2c = 0 for i in test_str: if i == '(': skip2c += 1 elif i == ')'and skip2c > 0: skip2c -= 1 elif skip1c == 0 and skip2c == 0: ret += i return ret def has_training_docs(self): return False def has_validation_docs(self): return True def has_test_docs(self): return False def training_docs(self): raise NotImplementedError("This dataset has no training docs") def test_docs(self): raise NotImplementedError("This dataset has no test docs") def validation_docs(self): data_xml_path = self.DATASET_PATH / "nlu-asdiv-dataset-master/dataset/ASDiv.xml" return self.load_docs(data_xml_path) def fewshot_context(self, doc, num_fewshot, provide_description, rnd): assert num_fewshot == 0, "ASDiv is intended only for the zero-shot setting." return super().fewshot_context(doc, num_fewshot, provide_description, rnd) def fewshot_description(self): # TODO: add solution-type and formula desc = "information containing the context of the question\nQuestion: Text of a question.\nAnswer: Answer to the question, based on the passage.\n" return desc def doc_to_text(self, doc): # TODO: add solution-type return doc['body'] + '\n' + 'Question:' + doc['question'] + '\n' + 'Answer:' def doc_to_target(self, doc): # TODO: add formula answer = self._strip_bracket(doc['answer']) if len(answer)>0: # check if answer is present only in brackets return answer else: return doc['answer'] def construct_requests(self, doc, ctx): ll, is_greedy = rf.loglikelihood(ctx, self.doc_to_target(doc)) return ll, is_greedy def process_results(self, doc, results): ll, is_greedy = results return { 'acc': int(is_greedy) } def aggregation(self): return { 'acc': mean } def higher_is_better(self): return { 'acc': True }