Commit 105fa974 authored by Leo Gao
Browse files

Add task versioning

parent f76e6367
...@@ -10,6 +10,7 @@ def evaluate(lm, task_dict, provide_description, num_fewshot, limit, bootstrap_i ...@@ -10,6 +10,7 @@ def evaluate(lm, task_dict, provide_description, num_fewshot, limit, bootstrap_i
task_dict_items = [(name, task) for name, task in task_dict.items() if(task.has_validation_docs() or task.has_test_docs())] task_dict_items = [(name, task) for name, task in task_dict.items() if(task.has_validation_docs() or task.has_test_docs())]
results = collections.defaultdict(dict) results = collections.defaultdict(dict)
versions = collections.defaultdict(dict)
requests = collections.defaultdict(list) requests = collections.defaultdict(list)
requests_origin = collections.defaultdict(list) requests_origin = collections.defaultdict(list)
...@@ -24,6 +25,7 @@ def evaluate(lm, task_dict, provide_description, num_fewshot, limit, bootstrap_i ...@@ -24,6 +25,7 @@ def evaluate(lm, task_dict, provide_description, num_fewshot, limit, bootstrap_i
# get lists of each type of request # get lists of each type of request
for task_name, task in task_dict_items: for task_name, task in task_dict_items:
versions[task_name] = task.VERSION
#default to test doc, fall back to val doc if validation unavailable #default to test doc, fall back to val doc if validation unavailable
# TODO: the test-fallback-to-val system isn't final, we should revisit it at some point # TODO: the test-fallback-to-val system isn't final, we should revisit it at some point
if task.has_test_docs(): if task.has_test_docs():
...@@ -95,4 +97,7 @@ def evaluate(lm, task_dict, provide_description, num_fewshot, limit, bootstrap_i ...@@ -95,4 +97,7 @@ def evaluate(lm, task_dict, provide_description, num_fewshot, limit, bootstrap_i
if stderr is not None: if stderr is not None:
results[task_name][metric + "_stderr"] = stderr(items) results[task_name][metric + "_stderr"] = stderr(items)
return results return {
"results": results,
"versions": versions
}
...@@ -5,6 +5,7 @@ from . common import HFTask ...@@ -5,6 +5,7 @@ from . common import HFTask
class ANLIBase(HFTask): class ANLIBase(HFTask):
VERSION = 0
DATASET_PATH = "anli" DATASET_PATH = "anli"
DATASET_NAME = None DATASET_NAME = None
SPLIT = None SPLIT = None
......
...@@ -3,6 +3,7 @@ from . common import HFTask ...@@ -3,6 +3,7 @@ from . common import HFTask
class ARCEasy(HFTask, MultipleChoiceTask): class ARCEasy(HFTask, MultipleChoiceTask):
VERSION = 0
DATASET_PATH = "ai2_arc" DATASET_PATH = "ai2_arc"
DATASET_NAME = "ARC-Easy" DATASET_NAME = "ARC-Easy"
......
...@@ -10,6 +10,7 @@ ArithmeticDoc = namedtuple('ArithmeticDoc', ['context', 'completion']) ...@@ -10,6 +10,7 @@ ArithmeticDoc = namedtuple('ArithmeticDoc', ['context', 'completion'])
class Arithmetic(Task): class Arithmetic(Task):
VERSION = 0
directory = 'data/arithmetic/' directory = 'data/arithmetic/'
def __init__(self): def __init__(self):
......
...@@ -15,6 +15,8 @@ class CBTBase(HFTask): ...@@ -15,6 +15,8 @@ class CBTBase(HFTask):
DATASET_PATH = "cbt" DATASET_PATH = "cbt"
DATASET_NAME = None DATASET_NAME = None
VERSION = 0
def fewshot_description(self): def fewshot_description(self):
# TODO: Figure out description. # TODO: Figure out description.
return "" return ""
......
...@@ -7,6 +7,7 @@ from itertools import zip_longest ...@@ -7,6 +7,7 @@ from itertools import zip_longest
class CoQA(Task): class CoQA(Task):
VERSION = 0
def download(self): def download(self):
coqa_train_filepath = 'data/coqa/coqa-train-v1.0.json' coqa_train_filepath = 'data/coqa/coqa-train-v1.0.json'
......
...@@ -16,6 +16,7 @@ https://github.com/allenai/allennlp-reading-comprehension/blob/master/allennlp_r ...@@ -16,6 +16,7 @@ https://github.com/allenai/allennlp-reading-comprehension/blob/master/allennlp_r
class DROP(Task): class DROP(Task):
VERSION = 0
DATASET_PATH = Path("data/drop") DATASET_PATH = Path("data/drop")
def download(self): def download(self):
......
...@@ -8,6 +8,7 @@ from ..utils import general_detokenize ...@@ -8,6 +8,7 @@ from ..utils import general_detokenize
class CoLA(HFTask): class CoLA(HFTask):
VERSION = 0
DATASET_PATH = "glue" DATASET_PATH = "glue"
DATASET_NAME = "cola" DATASET_NAME = "cola"
...@@ -55,6 +56,7 @@ class CoLA(HFTask): ...@@ -55,6 +56,7 @@ class CoLA(HFTask):
class SST(HFTask): class SST(HFTask):
VERSION = 0
DATASET_PATH = "glue" DATASET_PATH = "glue"
DATASET_NAME = "sst2" DATASET_NAME = "sst2"
...@@ -106,6 +108,7 @@ class SST(HFTask): ...@@ -106,6 +108,7 @@ class SST(HFTask):
class MNLI(HFTask): class MNLI(HFTask):
VERSION = 0
DATASET_PATH = "glue" DATASET_PATH = "glue"
DATASET_NAME = "mnli" DATASET_NAME = "mnli"
...@@ -163,6 +166,7 @@ class MNLI(HFTask): ...@@ -163,6 +166,7 @@ class MNLI(HFTask):
class MNLIMismatched(MNLI): class MNLIMismatched(MNLI):
VERSION = 0
def validation_docs(self): def validation_docs(self):
if self.has_validation_docs(): if self.has_validation_docs():
...@@ -174,6 +178,7 @@ class MNLIMismatched(MNLI): ...@@ -174,6 +178,7 @@ class MNLIMismatched(MNLI):
class QNLI(HFTask): class QNLI(HFTask):
VERSION = 0
DATASET_PATH = "glue" DATASET_PATH = "glue"
DATASET_NAME = "qnli" DATASET_NAME = "qnli"
...@@ -222,6 +227,7 @@ class QNLI(HFTask): ...@@ -222,6 +227,7 @@ class QNLI(HFTask):
class WNLI(HFTask): class WNLI(HFTask):
VERSION = 0
DATASET_PATH = "glue" DATASET_PATH = "glue"
DATASET_NAME = "wnli" DATASET_NAME = "wnli"
...@@ -271,6 +277,7 @@ class WNLI(HFTask): ...@@ -271,6 +277,7 @@ class WNLI(HFTask):
class RTE(HFTask): class RTE(HFTask):
VERSION = 0
DATASET_PATH = "glue" DATASET_PATH = "glue"
DATASET_NAME = "rte" DATASET_NAME = "rte"
...@@ -322,6 +329,7 @@ class RTE(HFTask): ...@@ -322,6 +329,7 @@ class RTE(HFTask):
class MRPC(HFTask): class MRPC(HFTask):
VERSION = 0
DATASET_PATH = "glue" DATASET_PATH = "glue"
DATASET_NAME = "mrpc" DATASET_NAME = "mrpc"
...@@ -374,6 +382,7 @@ class MRPC(HFTask): ...@@ -374,6 +382,7 @@ class MRPC(HFTask):
class QQP(HFTask): class QQP(HFTask):
VERSION = 0
DATASET_PATH = "glue" DATASET_PATH = "glue"
DATASET_NAME = "qqp" DATASET_NAME = "qqp"
...@@ -426,6 +435,7 @@ class QQP(HFTask): ...@@ -426,6 +435,7 @@ class QQP(HFTask):
class STSB(HFTask): class STSB(HFTask):
VERSION = 0
DATASET_PATH = "glue" DATASET_PATH = "glue"
DATASET_NAME = "stsb" DATASET_NAME = "stsb"
......
...@@ -3,6 +3,7 @@ from lm_eval.base import MultipleChoiceTask ...@@ -3,6 +3,7 @@ from lm_eval.base import MultipleChoiceTask
class HeadQA(HFTask, MultipleChoiceTask): class HeadQA(HFTask, MultipleChoiceTask):
VERSION = 0
DATASET_PATH = "head_qa" DATASET_PATH = "head_qa"
DATASET_NAME = None DATASET_NAME = None
......
...@@ -4,6 +4,7 @@ from . common import HFTask ...@@ -4,6 +4,7 @@ from . common import HFTask
class HellaSwag(HFTask, MultipleChoiceTask): class HellaSwag(HFTask, MultipleChoiceTask):
VERSION = 0
DATASET_PATH = "hellaswag" DATASET_PATH = "hellaswag"
DATASET_NAME = None DATASET_NAME = None
......
...@@ -85,6 +85,7 @@ class Ethics(Task): ...@@ -85,6 +85,7 @@ class Ethics(Task):
class EthicsCM(Ethics): class EthicsCM(Ethics):
VERSION = 0
# Ignoring "ambiguous" extra dataset for now # Ignoring "ambiguous" extra dataset for now
def get_prefix(self): def get_prefix(self):
return "commonsense/cm" return "commonsense/cm"
...@@ -123,6 +124,7 @@ class EthicsCM(Ethics): ...@@ -123,6 +124,7 @@ class EthicsCM(Ethics):
class EthicsDeontology(Ethics): class EthicsDeontology(Ethics):
VERSION = 0
def get_prefix(self): def get_prefix(self):
return "deontology/deontology" return "deontology/deontology"
...@@ -172,6 +174,7 @@ class EthicsDeontology(Ethics): ...@@ -172,6 +174,7 @@ class EthicsDeontology(Ethics):
class EthicsJustice(Ethics): class EthicsJustice(Ethics):
VERSION = 0
def get_prefix(self): def get_prefix(self):
return "justice/justice" return "justice/justice"
...@@ -220,6 +223,7 @@ class EthicsJustice(Ethics): ...@@ -220,6 +223,7 @@ class EthicsJustice(Ethics):
class EthicsUtilitarianismOriginal(Ethics): class EthicsUtilitarianismOriginal(Ethics):
VERSION = 0
def get_prefix(self): def get_prefix(self):
return "utilitarianism/util" return "utilitarianism/util"
...@@ -287,6 +291,7 @@ class EthicsUtilitarianismOriginal(Ethics): ...@@ -287,6 +291,7 @@ class EthicsUtilitarianismOriginal(Ethics):
class EthicsUtilitarianism(Ethics): class EthicsUtilitarianism(Ethics):
VERSION = 0
""" """
This is a variation of the original Utilitarianism task used in the paper, where the situations are directly compared. This is a variation of the original Utilitarianism task used in the paper, where the situations are directly compared.
This allows scaling to >5 shots. This allows scaling to >5 shots.
...@@ -339,6 +344,7 @@ class EthicsUtilitarianism(Ethics): ...@@ -339,6 +344,7 @@ class EthicsUtilitarianism(Ethics):
class EthicsVirtue(Ethics): class EthicsVirtue(Ethics):
VERSION = 0
def get_prefix(self): def get_prefix(self):
return "virtue/virtue" return "virtue/virtue"
......
...@@ -287,35 +287,42 @@ class Math(Task): ...@@ -287,35 +287,42 @@ class Math(Task):
class MathAlgebra(Math): class MathAlgebra(Math):
VERSION = 0
def get_file_info(self): def get_file_info(self):
return 'algebra' return 'algebra'
class MathCountingAndProbability(Math): class MathCountingAndProbability(Math):
VERSION = 0
def get_file_info(self): def get_file_info(self):
return 'counting_and_probability' return 'counting_and_probability'
class MathGeometry(Math): class MathGeometry(Math):
VERSION = 0
def get_file_info(self): def get_file_info(self):
return 'geometry' return 'geometry'
class MathIntermediateAlgebra(Math): class MathIntermediateAlgebra(Math):
VERSION = 0
def get_file_info(self): def get_file_info(self):
return 'intermediate_algebra' return 'intermediate_algebra'
class MathNumberTheory(Math): class MathNumberTheory(Math):
VERSION = 0
def get_file_info(self): def get_file_info(self):
return 'number_theory' return 'number_theory'
class MathPrealgebra(Math): class MathPrealgebra(Math):
VERSION = 0
def get_file_info(self): def get_file_info(self):
return 'prealgebra' return 'prealgebra'
class MathPrecalculus(Math): class MathPrecalculus(Math):
VERSION = 0
def get_file_info(self): def get_file_info(self):
return 'precalculus' return 'precalculus'
...@@ -34,6 +34,7 @@ def create_task(subject): ...@@ -34,6 +34,7 @@ def create_task(subject):
class GeneralHendrycksTest(MultipleChoiceTask): class GeneralHendrycksTest(MultipleChoiceTask):
VERSION = 0
DATASET_PATH = Path("data/hendrycksTest/") DATASET_PATH = Path("data/hendrycksTest/")
def __init__(self, subject): def __init__(self, subject):
......
...@@ -6,6 +6,7 @@ from best_download import download_file ...@@ -6,6 +6,7 @@ from best_download import download_file
class LAMBADA(Task): class LAMBADA(Task):
VERSION = 0
def download(self): def download(self):
sh("mkdir -p data/lambada") sh("mkdir -p data/lambada")
sh("wget http://eaidata.bmk.sh/data/lambada_test.jsonl -O data/lambada/lambada_test.jsonl") sh("wget http://eaidata.bmk.sh/data/lambada_test.jsonl -O data/lambada/lambada_test.jsonl")
......
...@@ -7,6 +7,7 @@ from best_download import download_file ...@@ -7,6 +7,7 @@ from best_download import download_file
class LAMBADA_cloze(LAMBADA): class LAMBADA_cloze(LAMBADA):
VERSION = 0
def doc_to_text(self, doc): def doc_to_text(self, doc):
return doc['text'].rsplit(' ', 1)[0] + " ____. ->" return doc['text'].rsplit(' ', 1)[0] + " ____. ->"
......
...@@ -4,6 +4,7 @@ from pathlib import Path ...@@ -4,6 +4,7 @@ from pathlib import Path
class LogiQA(MultipleChoiceTask): class LogiQA(MultipleChoiceTask):
VERSION = 0
DATASET_PATH = Path("data/logiqa") DATASET_PATH = Path("data/logiqa")
def download(self): def download(self):
......
...@@ -4,6 +4,7 @@ from . common import HFTask ...@@ -4,6 +4,7 @@ from . common import HFTask
class MathQA(HFTask, MultipleChoiceTask): class MathQA(HFTask, MultipleChoiceTask):
VERSION = 0
DATASET_PATH = "math_qa" DATASET_PATH = "math_qa"
DATASET_NAME = None DATASET_NAME = None
......
...@@ -4,6 +4,7 @@ from itertools import islice ...@@ -4,6 +4,7 @@ from itertools import islice
class NaturalQs(HFTask): class NaturalQs(HFTask):
VERSION = 0
# TODO: naturalqs has a *really* large train set that huggingface just # TODO: naturalqs has a *really* large train set that huggingface just
# automatically downloads even if you dont use it. we should try and only # automatically downloads even if you dont use it. we should try and only
# download the val set and not even bother with the train set. # download the val set and not even bother with the train set.
......
...@@ -3,6 +3,7 @@ from .common import HFTask ...@@ -3,6 +3,7 @@ from .common import HFTask
class OpenBookQA(HFTask, MultipleChoiceTask): class OpenBookQA(HFTask, MultipleChoiceTask):
VERSION = 0
DATASET_PATH = "openbookqa" DATASET_PATH = "openbookqa"
DATASET_NAME = "main" DATASET_NAME = "main"
......
...@@ -10,6 +10,7 @@ from best_download import download_file ...@@ -10,6 +10,7 @@ from best_download import download_file
class PilePerplexityTask(PerplexityTask, abc.ABC): class PilePerplexityTask(PerplexityTask, abc.ABC):
VERSION = 0
PILE_SET_NAME = None PILE_SET_NAME = None
VAL_PATH = 'data/pile/val.jsonl.zst' VAL_PATH = 'data/pile/val.jsonl.zst'
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment