Commit 105fa974 authored by Leo Gao
Browse files

Add task versioning

parent f76e6367
......@@ -10,6 +10,7 @@ def evaluate(lm, task_dict, provide_description, num_fewshot, limit, bootstrap_i
task_dict_items = [(name, task) for name, task in task_dict.items() if(task.has_validation_docs() or task.has_test_docs())]
results = collections.defaultdict(dict)
versions = collections.defaultdict(dict)
requests = collections.defaultdict(list)
requests_origin = collections.defaultdict(list)
......@@ -24,6 +25,7 @@ def evaluate(lm, task_dict, provide_description, num_fewshot, limit, bootstrap_i
# get lists of each type of request
for task_name, task in task_dict_items:
versions[task_name] = task.VERSION
# default to test docs; fall back to validation docs if test docs are unavailable
# TODO: the test-fallback-to-val system isn't final, we should revisit it at some point
if task.has_test_docs():
......@@ -95,4 +97,7 @@ def evaluate(lm, task_dict, provide_description, num_fewshot, limit, bootstrap_i
if stderr is not None:
results[task_name][metric + "_stderr"] = stderr(items)
return results
return {
"results": results,
"versions": versions
}
......@@ -5,6 +5,7 @@ from . common import HFTask
class ANLIBase(HFTask):
VERSION = 0
DATASET_PATH = "anli"
DATASET_NAME = None
SPLIT = None
......
......@@ -3,6 +3,7 @@ from . common import HFTask
class ARCEasy(HFTask, MultipleChoiceTask):
VERSION = 0
DATASET_PATH = "ai2_arc"
DATASET_NAME = "ARC-Easy"
......
......@@ -10,6 +10,7 @@ ArithmeticDoc = namedtuple('ArithmeticDoc', ['context', 'completion'])
class Arithmetic(Task):
VERSION = 0
directory = 'data/arithmetic/'
def __init__(self):
......
......@@ -15,6 +15,8 @@ class CBTBase(HFTask):
DATASET_PATH = "cbt"
DATASET_NAME = None
VERSION = 0
def fewshot_description(self):
# TODO: Figure out description.
return ""
......
......@@ -7,6 +7,7 @@ from itertools import zip_longest
class CoQA(Task):
VERSION = 0
def download(self):
coqa_train_filepath = 'data/coqa/coqa-train-v1.0.json'
......
......@@ -16,6 +16,7 @@ https://github.com/allenai/allennlp-reading-comprehension/blob/master/allennlp_r
class DROP(Task):
VERSION = 0
DATASET_PATH = Path("data/drop")
def download(self):
......
......@@ -8,6 +8,7 @@ from ..utils import general_detokenize
class CoLA(HFTask):
VERSION = 0
DATASET_PATH = "glue"
DATASET_NAME = "cola"
......@@ -55,6 +56,7 @@ class CoLA(HFTask):
class SST(HFTask):
VERSION = 0
DATASET_PATH = "glue"
DATASET_NAME = "sst2"
......@@ -106,6 +108,7 @@ class SST(HFTask):
class MNLI(HFTask):
VERSION = 0
DATASET_PATH = "glue"
DATASET_NAME = "mnli"
......@@ -163,6 +166,7 @@ class MNLI(HFTask):
class MNLIMismatched(MNLI):
VERSION = 0
def validation_docs(self):
if self.has_validation_docs():
......@@ -174,6 +178,7 @@ class MNLIMismatched(MNLI):
class QNLI(HFTask):
VERSION = 0
DATASET_PATH = "glue"
DATASET_NAME = "qnli"
......@@ -222,6 +227,7 @@ class QNLI(HFTask):
class WNLI(HFTask):
VERSION = 0
DATASET_PATH = "glue"
DATASET_NAME = "wnli"
......@@ -271,6 +277,7 @@ class WNLI(HFTask):
class RTE(HFTask):
VERSION = 0
DATASET_PATH = "glue"
DATASET_NAME = "rte"
......@@ -322,6 +329,7 @@ class RTE(HFTask):
class MRPC(HFTask):
VERSION = 0
DATASET_PATH = "glue"
DATASET_NAME = "mrpc"
......@@ -374,6 +382,7 @@ class MRPC(HFTask):
class QQP(HFTask):
VERSION = 0
DATASET_PATH = "glue"
DATASET_NAME = "qqp"
......@@ -426,6 +435,7 @@ class QQP(HFTask):
class STSB(HFTask):
VERSION = 0
DATASET_PATH = "glue"
DATASET_NAME = "stsb"
......
......@@ -3,6 +3,7 @@ from lm_eval.base import MultipleChoiceTask
class HeadQA(HFTask, MultipleChoiceTask):
VERSION = 0
DATASET_PATH = "head_qa"
DATASET_NAME = None
......
......@@ -4,6 +4,7 @@ from . common import HFTask
class HellaSwag(HFTask, MultipleChoiceTask):
VERSION = 0
DATASET_PATH = "hellaswag"
DATASET_NAME = None
......
......@@ -85,6 +85,7 @@ class Ethics(Task):
class EthicsCM(Ethics):
VERSION = 0
# Ignoring "ambiguous" extra dataset for now
def get_prefix(self):
return "commonsense/cm"
......@@ -123,6 +124,7 @@ class EthicsCM(Ethics):
class EthicsDeontology(Ethics):
VERSION = 0
def get_prefix(self):
return "deontology/deontology"
......@@ -172,6 +174,7 @@ class EthicsDeontology(Ethics):
class EthicsJustice(Ethics):
VERSION = 0
def get_prefix(self):
return "justice/justice"
......@@ -220,6 +223,7 @@ class EthicsJustice(Ethics):
class EthicsUtilitarianismOriginal(Ethics):
VERSION = 0
def get_prefix(self):
return "utilitarianism/util"
......@@ -287,6 +291,7 @@ class EthicsUtilitarianismOriginal(Ethics):
class EthicsUtilitarianism(Ethics):
VERSION = 0
"""
This is a variation of the original Utilitarianism task used in the paper, where the situations are directly compared.
This allows scaling to >5 shots.
......@@ -339,6 +344,7 @@ class EthicsUtilitarianism(Ethics):
class EthicsVirtue(Ethics):
VERSION = 0
def get_prefix(self):
return "virtue/virtue"
......
......@@ -287,35 +287,42 @@ class Math(Task):
class MathAlgebra(Math):
    """Math subclass whose ``get_file_info`` returns ``'algebra'``.

    NOTE(review): presumably selects the algebra subset of the data handled
    by ``Math`` -- confirm against the base class.
    """

    # Version number of this task definition.
    VERSION = 0

    def get_file_info(self):
        """Return the identifier string for this variant."""
        subject = 'algebra'
        return subject
class MathCountingAndProbability(Math):
    """Math subclass whose ``get_file_info`` returns ``'counting_and_probability'``.

    NOTE(review): presumably selects the counting-and-probability subset of
    the data handled by ``Math`` -- confirm against the base class.
    """

    # Version number of this task definition.
    VERSION = 0

    def get_file_info(self):
        """Return the identifier string for this variant."""
        subject = 'counting_and_probability'
        return subject
class MathGeometry(Math):
    """Math subclass whose ``get_file_info`` returns ``'geometry'``.

    NOTE(review): presumably selects the geometry subset of the data handled
    by ``Math`` -- confirm against the base class.
    """

    # Version number of this task definition.
    VERSION = 0

    def get_file_info(self):
        """Return the identifier string for this variant."""
        subject = 'geometry'
        return subject
class MathIntermediateAlgebra(Math):
    """Math subclass whose ``get_file_info`` returns ``'intermediate_algebra'``.

    NOTE(review): presumably selects the intermediate-algebra subset of the
    data handled by ``Math`` -- confirm against the base class.
    """

    # Version number of this task definition.
    VERSION = 0

    def get_file_info(self):
        """Return the identifier string for this variant."""
        subject = 'intermediate_algebra'
        return subject
class MathNumberTheory(Math):
    """Math subclass whose ``get_file_info`` returns ``'number_theory'``.

    NOTE(review): presumably selects the number-theory subset of the data
    handled by ``Math`` -- confirm against the base class.
    """

    # Version number of this task definition.
    VERSION = 0

    def get_file_info(self):
        """Return the identifier string for this variant."""
        subject = 'number_theory'
        return subject
class MathPrealgebra(Math):
    """Math subclass whose ``get_file_info`` returns ``'prealgebra'``.

    NOTE(review): presumably selects the prealgebra subset of the data
    handled by ``Math`` -- confirm against the base class.
    """

    # Version number of this task definition.
    VERSION = 0

    def get_file_info(self):
        """Return the identifier string for this variant."""
        subject = 'prealgebra'
        return subject
class MathPrecalculus(Math):
    """Math subclass whose ``get_file_info`` returns ``'precalculus'``.

    NOTE(review): presumably selects the precalculus subset of the data
    handled by ``Math`` -- confirm against the base class.
    """

    # Version number of this task definition.
    VERSION = 0

    def get_file_info(self):
        """Return the identifier string for this variant."""
        subject = 'precalculus'
        return subject
......@@ -34,6 +34,7 @@ def create_task(subject):
class GeneralHendrycksTest(MultipleChoiceTask):
VERSION = 0
DATASET_PATH = Path("data/hendrycksTest/")
def __init__(self, subject):
......
......@@ -6,6 +6,7 @@ from best_download import download_file
class LAMBADA(Task):
VERSION = 0
def download(self):
sh("mkdir -p data/lambada")
sh("wget http://eaidata.bmk.sh/data/lambada_test.jsonl -O data/lambada/lambada_test.jsonl")
......
......@@ -7,6 +7,7 @@ from best_download import download_file
class LAMBADA_cloze(LAMBADA):
VERSION = 0
def doc_to_text(self, doc):
return doc['text'].rsplit(' ', 1)[0] + " ____. ->"
......
......@@ -4,6 +4,7 @@ from pathlib import Path
class LogiQA(MultipleChoiceTask):
VERSION = 0
DATASET_PATH = Path("data/logiqa")
def download(self):
......
......@@ -4,6 +4,7 @@ from . common import HFTask
class MathQA(HFTask, MultipleChoiceTask):
VERSION = 0
DATASET_PATH = "math_qa"
DATASET_NAME = None
......
......@@ -4,6 +4,7 @@ from itertools import islice
class NaturalQs(HFTask):
VERSION = 0
# TODO: naturalqs has a *really* large train set that huggingface just
# automatically downloads even if you dont use it. we should try and only
# download the val set and not even bother with the train set.
......
......@@ -3,6 +3,7 @@ from .common import HFTask
class OpenBookQA(HFTask, MultipleChoiceTask):
VERSION = 0
DATASET_PATH = "openbookqa"
DATASET_NAME = "main"
......
......@@ -10,6 +10,7 @@ from best_download import download_file
class PilePerplexityTask(PerplexityTask, abc.ABC):
VERSION = 0
PILE_SET_NAME = None
VAL_PATH = 'data/pile/val.jsonl.zst'
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment