Commit 55e62507 authored by researcher2
Browse files

Merge branch 'master' into researcher2

parents bb0eafbb 26f0233f
...@@ -2,10 +2,9 @@ from . common import HFTask ...@@ -2,10 +2,9 @@ from . common import HFTask
from lm_eval.base import MultipleChoiceTask from lm_eval.base import MultipleChoiceTask
class HeadQA(HFTask, MultipleChoiceTask): class HeadQABase(HFTask, MultipleChoiceTask):
VERSION = 0 VERSION = 0
DATASET_PATH = "head_qa" DATASET_PATH = "head_qa"
DATASET_NAME = None
def has_training_docs(self): def has_training_docs(self):
return True return True
...@@ -25,9 +24,19 @@ class HeadQA(HFTask, MultipleChoiceTask): ...@@ -25,9 +24,19 @@ class HeadQA(HFTask, MultipleChoiceTask):
} }
return out_doc return out_doc
def fewshot_description(self):
# TODO: figure out description
return ""
def doc_to_text(self, doc): def doc_to_text(self, doc):
return doc["query"] return doc["query"]
class HeadQAEn(HeadQABase):
    """HeadQA task variant whose ``DATASET_NAME`` is ``"en"``.

    Everything else is inherited from ``HeadQABase``.
    NOTE(review): presumably this selects the English configuration of the
    ``head_qa`` dataset -- confirm against ``HFTask``.
    """

    DATASET_NAME = "en"
class HeadQAEs(HeadQABase):
    """HeadQA task variant whose ``DATASET_NAME`` is ``"es"``.

    Everything else is inherited from ``HeadQABase``.
    NOTE(review): presumably this selects the Spanish configuration of the
    ``head_qa`` dataset -- confirm against ``HFTask``.
    """

    DATASET_NAME = "es"
# for backwards compatibility
class HeadQAEsDeprecated(HeadQABase):
    """Legacy HeadQA task kept so the old ``headqa`` name keeps working.

    Uses ``DATASET_NAME = "es"`` and issues a deprecation warning when it is
    instantiated, pointing users at ``headqa_es`` / ``headqa_en``.
    """

    DATASET_NAME = "es"

    def __init__(self):
        """Initialise the base task, then warn that this task name is deprecated."""
        # Imported locally so this block does not rely on a module-level import.
        import warnings

        super().__init__()
        # Use the warnings machinery instead of print(): the notice goes to
        # stderr rather than stdout and can be filtered by callers.
        # FutureWarning is shown by default; stacklevel=2 attributes the
        # warning to the code that instantiated the task.
        warnings.warn(
            "WARNING: headqa is deprecated. Please use headqa_es or headqa_en instead. See https://github.com/EleutherAI/lm-evaluation-harness/pull/240 for more info.",
            FutureWarning,
            stacklevel=2,
        )
\ No newline at end of file
...@@ -35,11 +35,6 @@ class HellaSwag(HFTask, MultipleChoiceTask): ...@@ -35,11 +35,6 @@ class HellaSwag(HFTask, MultipleChoiceTask):
} }
return out_doc return out_doc
def fewshot_description(self):
return "Label for the relevant action: Sentences describing the " \
"context, with an incomplete sentence trailing\nanswer that " \
"plausibly completes the situation."
def doc_to_text(self, doc): def doc_to_text(self, doc):
return doc["query"] return doc["query"]
......
...@@ -20,7 +20,7 @@ class Ethics(Task): ...@@ -20,7 +20,7 @@ class Ethics(Task):
def download(self): def download(self):
if not os.path.exists('data/ethics/done'): if not os.path.exists('data/ethics/done'):
sh("mkdir -p data") sh("mkdir -p data")
download_file("https://people.eecs.berkeley.edu/~hendrycks/ethics.tar", "data/ethics.tar", "40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333") download_file("https://people.eecs.berkeley.edu/~hendrycks/ethics.tar", local_file="data/ethics.tar", expected_checksum="40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333")
sh(""" sh("""
tar -xf data/ethics.tar -C data/ tar -xf data/ethics.tar -C data/
rm data/ethics.tar rm data/ethics.tar
...@@ -237,9 +237,6 @@ class EthicsUtilitarianismOriginal(Ethics): ...@@ -237,9 +237,6 @@ class EthicsUtilitarianismOriginal(Ethics):
for doc in docs: for doc in docs:
yield {"activity": doc[0], "baseline": doc[1], "rating": ""} yield {"activity": doc[0], "baseline": doc[1], "rating": ""}
def fewshot_description(self):
return "Rate how pleasant each of the following activities is on a scale from 1 (very unpleasant) to 10 (very pleasant).\n\n"
def fewshot_examples(self, k, rnd): def fewshot_examples(self, k, rnd):
# Overwriting fewshot examples as k can be max 5 # Overwriting fewshot examples as k can be max 5
assert k <= 5, "There are only 5 possible shots for this task. Refer to the V2 for more." assert k <= 5, "There are only 5 possible shots for this task. Refer to the V2 for more."
...@@ -350,9 +347,6 @@ class EthicsVirtue(Ethics): ...@@ -350,9 +347,6 @@ class EthicsVirtue(Ethics):
def get_prefix(self): def get_prefix(self):
return "virtue/virtue" return "virtue/virtue"
def fewshot_description(self):
return "The following is a list of sentences and traits, along with whether the trait is exhibited in that sentence.\n\n"
def process_doc(self, doc): def process_doc(self, doc):
# Append identifiers before shuffling to calculate exact matches lateron & skip the first element of headers # Append identifiers before shuffling to calculate exact matches lateron & skip the first element of headers
return [x + [i] for i, x in enumerate(doc[1:])] return [x + [i] for i, x in enumerate(doc[1:])]
......
...@@ -18,7 +18,7 @@ class Math(Task): ...@@ -18,7 +18,7 @@ class Math(Task):
def download(self): def download(self):
if not (self.DATASET_PATH / 'test').exists() or not (self.DATASET_PATH / 'done').exists(): if not (self.DATASET_PATH / 'test').exists() or not (self.DATASET_PATH / 'done').exists():
sh(f"mkdir -p {self.DATASET_PATH}") sh(f"mkdir -p {self.DATASET_PATH}")
download_file("https://people.eecs.berkeley.edu/~hendrycks/MATH.tar", f"{self.DATASET_PATH}.tar", "01256fd7cd5430596fdf07e6e6a5827111b5235b7ffed679c662a12f898932da") download_file("https://people.eecs.berkeley.edu/~hendrycks/MATH.tar", local_file=f"{self.DATASET_PATH}.tar", expected_checksum="01256fd7cd5430596fdf07e6e6a5827111b5235b7ffed679c662a12f898932da")
sh(f""" sh(f"""
tar -xf {self.DATASET_PATH}.tar -C data/ && touch {self.DATASET_PATH / 'done'} tar -xf {self.DATASET_PATH}.tar -C data/ && touch {self.DATASET_PATH / 'done'}
rm {self.DATASET_PATH}.tar rm {self.DATASET_PATH}.tar
...@@ -55,9 +55,6 @@ class Math(Task): ...@@ -55,9 +55,6 @@ class Math(Task):
def test_docs(self): def test_docs(self):
return self._load_docs(self.DATASET_PATH / "test" / self.get_file_info()) return self._load_docs(self.DATASET_PATH / "test" / self.get_file_info())
def fewshot_description(self):
return "Given a mathematics problem, determine the answer. Simplify your answer as much as possible."
def doc_to_text(self, doc): def doc_to_text(self, doc):
return "Problem: " + doc["problem"] + "\nAnswer:" return "Problem: " + doc["problem"] + "\nAnswer:"
......
...@@ -45,7 +45,7 @@ class GeneralHendrycksTest(MultipleChoiceTask): ...@@ -45,7 +45,7 @@ class GeneralHendrycksTest(MultipleChoiceTask):
def download(self): def download(self):
if not (self.DATASET_PATH / 'done').exists(): if not (self.DATASET_PATH / 'done').exists():
sh("mkdir -p data") sh("mkdir -p data")
download_file("https://people.eecs.berkeley.edu/~hendrycks/data.tar", "data/data.tar", "78a804365a59028188fb19bd1adcadc5e0c260b220a9d8b2e33a5ea7d5fbe3b4") download_file("https://people.eecs.berkeley.edu/~hendrycks/data.tar", local_file="data/data.tar", expected_checksum="78a804365a59028188fb19bd1adcadc5e0c260b220a9d8b2e33a5ea7d5fbe3b4")
sh(""" sh("""
tar -xf data/data.tar -C data/ tar -xf data/data.tar -C data/
rm data/data.tar rm data/data.tar
...@@ -114,9 +114,5 @@ class GeneralHendrycksTest(MultipleChoiceTask): ...@@ -114,9 +114,5 @@ class GeneralHendrycksTest(MultipleChoiceTask):
return rnd.sample(list(self._fewshot_docs), k) return rnd.sample(list(self._fewshot_docs), k)
def fewshot_description(self):
subject = self.subject.replace("_", " ")
return f"The following are multiple choice questions (with answers) about {subject}."
def doc_to_text(self, doc): def doc_to_text(self, doc):
return doc["query"] return doc["query"]
...@@ -14,8 +14,8 @@ class LAMBADA(Task): ...@@ -14,8 +14,8 @@ class LAMBADA(Task):
if not os.path.exists("data/lambada/lambada_test.jsonl"): if not os.path.exists("data/lambada/lambada_test.jsonl"):
download_file( download_file(
"http://eaidata.bmk.sh/data/lambada_test.jsonl", "http://eaidata.bmk.sh/data/lambada_test.jsonl",
"data/lambada/lambada_test.jsonl", local_file="data/lambada/lambada_test.jsonl",
"4aa8d02cd17c719165fc8a7887fddd641f43fcafa4b1c806ca8abc31fabdb226" expected_checksum="4aa8d02cd17c719165fc8a7887fddd641f43fcafa4b1c806ca8abc31fabdb226"
) )
except: except:
# fallback - for some reason best_download doesnt work all the time here # fallback - for some reason best_download doesnt work all the time here
...@@ -54,10 +54,6 @@ class LAMBADA(Task): ...@@ -54,10 +54,6 @@ class LAMBADA(Task):
def doc_to_target(self, doc): def doc_to_target(self, doc):
return " " + doc['text'].rsplit(' ', 1)[1] return " " + doc['text'].rsplit(' ', 1)[1]
def fewshot_description(self):
# TODO: figure out description
return ""
def construct_requests(self, doc, ctx): def construct_requests(self, doc, ctx):
ll, is_greedy = rf.loglikelihood(ctx, self.doc_to_target(doc)) ll, is_greedy = rf.loglikelihood(ctx, self.doc_to_target(doc))
......
...@@ -13,6 +13,3 @@ class LAMBADA_cloze(LAMBADA): ...@@ -13,6 +13,3 @@ class LAMBADA_cloze(LAMBADA):
def doc_to_target(self, doc): def doc_to_target(self, doc):
return " " + doc['text'].rsplit(' ', 1)[1] return " " + doc['text'].rsplit(' ', 1)[1]
def fewshot_description(self):
return "Fill in blank:\n"
...@@ -32,8 +32,8 @@ class MultilingualLAMBADA(lambada.LAMBADA): ...@@ -32,8 +32,8 @@ class MultilingualLAMBADA(lambada.LAMBADA):
if not os.path.exists(f): if not os.path.exists(f):
download_file( download_file(
url, url,
f, local_file=f,
CHECKSUMS[self.LANG] expected_checksum=CHECKSUMS[self.LANG]
) )
except: except:
# fallback - for some reason best_download doesnt work all the time here # fallback - for some reason best_download doesnt work all the time here
......
...@@ -19,7 +19,7 @@ class LogiQA(MultipleChoiceTask): ...@@ -19,7 +19,7 @@ class LogiQA(MultipleChoiceTask):
] ]
for split in splits: for split in splits:
file = self.DATASET_PATH / f"{split['name']}.txt" file = self.DATASET_PATH / f"{split['name']}.txt"
download_file(f"{base_url}/{split['name']}.txt", str(file), split["checksum"]) download_file(f"{base_url}/{split['name']}.txt", local_file=str(file), expected_checksum=split["checksum"])
def has_training_docs(self): def has_training_docs(self):
return True return True
...@@ -81,10 +81,6 @@ class LogiQA(MultipleChoiceTask): ...@@ -81,10 +81,6 @@ class LogiQA(MultipleChoiceTask):
def test_docs(self): def test_docs(self):
return self._load_docs(self.DATASET_PATH / "Test.txt") return self._load_docs(self.DATASET_PATH / "Test.txt")
def fewshot_description(self):
# TODO: figure out actual description
return ""
def doc_to_text(self, doc): def doc_to_text(self, doc):
return doc["query"] return doc["query"]
......
...@@ -29,10 +29,6 @@ class MathQA(HFTask, MultipleChoiceTask): ...@@ -29,10 +29,6 @@ class MathQA(HFTask, MultipleChoiceTask):
} }
return out_doc return out_doc
def fewshot_description(self):
# TODO: figure out description
return ""
def doc_to_text(self, doc): def doc_to_text(self, doc):
return doc["query"] return doc["query"]
......
...@@ -39,9 +39,6 @@ class MCTACO(HFTask): ...@@ -39,9 +39,6 @@ class MCTACO(HFTask):
def has_test_docs(self): def has_test_docs(self):
return True return True
def fewshot_description(self):
return "Determine whether the candidate answer is plausible (\"yes\") or not (\"no\")"
def doc_to_text(self, doc): def doc_to_text(self, doc):
return f"{doc['sentence']}\nQuestion: {doc['question']}\n"\ return f"{doc['sentence']}\nQuestion: {doc['question']}\n"\
f"Answer: {doc['answer']}\nPlausible:" f"Answer: {doc['answer']}\nPlausible:"
......
...@@ -36,8 +36,8 @@ class MuTualBase(Task): ...@@ -36,8 +36,8 @@ class MuTualBase(Task):
master_zip = Path("data/master.zip") master_zip = Path("data/master.zip")
download_file( download_file(
"https://github.com/Nealcly/MuTual/archive/master.zip", "https://github.com/Nealcly/MuTual/archive/master.zip",
str(master_zip), local_file=str(master_zip),
"bb325cf6c672f0f02699993a37138b0fa0af6fcfc77ec81dfbe46add4d7b29f9") expected_checksum="bb325cf6c672f0f02699993a37138b0fa0af6fcfc77ec81dfbe46add4d7b29f9")
with zipfile.ZipFile(master_zip, 'r') as zip: with zipfile.ZipFile(master_zip, 'r') as zip:
zip.extractall("data") zip.extractall("data")
Path("data/MuTual-master/data").rename(str(self.BASE_PATH)) Path("data/MuTual-master/data").rename(str(self.BASE_PATH))
...@@ -70,10 +70,6 @@ class MuTualBase(Task): ...@@ -70,10 +70,6 @@ class MuTualBase(Task):
def test_docs(self): def test_docs(self):
return NotImplemented return NotImplemented
def fewshot_description(self):
# TODO: figure out fewshot description
return ""
def doc_to_text(self, doc): def doc_to_text(self, doc):
return self.detokenize(doc["article"]) return self.detokenize(doc["article"])
......
...@@ -21,10 +21,6 @@ class NaturalQs(HFTask): ...@@ -21,10 +21,6 @@ class NaturalQs(HFTask):
def has_test_docs(self): def has_test_docs(self):
return False return False
def fewshot_description(self):
# TODO: figure out description
return ""
def training_docs(self): def training_docs(self):
# Cache training for faster few-shot. # Cache training for faster few-shot.
# Data is too large to fit in memory. # Data is too large to fit in memory.
......
...@@ -25,9 +25,5 @@ class OpenBookQA(HFTask, MultipleChoiceTask): ...@@ -25,9 +25,5 @@ class OpenBookQA(HFTask, MultipleChoiceTask):
} }
return out_doc return out_doc
def fewshot_description(self):
# TODO: figure out fewshot description
return ""
def doc_to_text(self, doc): def doc_to_text(self, doc):
return doc["query"] return doc["query"]
...@@ -10,7 +10,7 @@ from best_download import download_file ...@@ -10,7 +10,7 @@ from best_download import download_file
class PilePerplexityTask(PerplexityTask, abc.ABC): class PilePerplexityTask(PerplexityTask, abc.ABC):
VERSION = 0 VERSION = 1
PILE_SET_NAME = None PILE_SET_NAME = None
VAL_PATH = 'data/pile/val.jsonl.zst' VAL_PATH = 'data/pile/val.jsonl.zst'
...@@ -18,9 +18,11 @@ class PilePerplexityTask(PerplexityTask, abc.ABC): ...@@ -18,9 +18,11 @@ class PilePerplexityTask(PerplexityTask, abc.ABC):
def download(self):
    """Download the Pile validation and test shards unless both are on disk.

    Nothing is fetched when both ``self.VAL_PATH`` and ``self.TEST_PATH``
    exist. Otherwise ``data/pile/`` is created if needed and both shards are
    requested through ``download_file`` with their expected checksums.
    """
    # TODO: separate pile val/test out by component so we don't have to scan the entire file once per set

    # Check both shards via the class attributes rather than one hard-coded
    # test path, so a missing validation file also triggers the download.
    if os.path.exists(self.VAL_PATH) and os.path.exists(self.TEST_PATH):
        return

    # todo use new best_download fallback api
    os.makedirs("data/pile/", exist_ok=True)
    # The mirror is plain HTTP; an expected checksum is passed with each request.
    download_file(
        "http://eaidata.bmk.sh/data/pile/val.jsonl.zst",
        local_file=self.VAL_PATH,
        expected_checksum="264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92",
    )
    download_file(
        "http://eaidata.bmk.sh/data/pile/test.jsonl.zst",
        local_file=self.TEST_PATH,
        expected_checksum="0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e",
    )
def validation_docs(self): def validation_docs(self):
rdr = lm_dataformat.Reader(self.VAL_PATH) rdr = lm_dataformat.Reader(self.VAL_PATH)
......
...@@ -18,10 +18,6 @@ class PiQA(HFTask, MultipleChoiceTask): ...@@ -18,10 +18,6 @@ class PiQA(HFTask, MultipleChoiceTask):
def has_test_docs(self): def has_test_docs(self):
return False return False
def fewshot_description(self):
# TODO: figure out fewshot description
return ""
def _convert_standard(self, doc): def _convert_standard(self, doc):
out_doc = { out_doc = {
"goal": doc["goal"], "goal": doc["goal"],
......
...@@ -36,13 +36,14 @@ class PROST(HFTask, MultipleChoiceTask): ...@@ -36,13 +36,14 @@ class PROST(HFTask, MultipleChoiceTask):
def has_test_docs(self): def has_test_docs(self):
return True return True
def fewshot_context(self, doc, num_fewshot, provide_description=None, rnd=None, description=None):
    """Build the prompt context for ``doc``, allowing zero-shot use only.

    Asserts that ``num_fewshot`` is 0 and then delegates to the parent
    implementation with ``doc``, ``num_fewshot``, ``rnd`` and ``description``
    passed as keyword arguments.

    ``provide_description`` is accepted but not forwarded to the parent.
    NOTE(review): confirm that dropping it here is intentional.
    """
    zero_shot_only = 'PROST is designed to probe models in a zero-shot fashion only.'
    assert num_fewshot == 0, zero_shot_only

    forwarded = {
        "doc": doc,
        "num_fewshot": num_fewshot,
        "rnd": rnd,
        "description": description,
    }
    return super().fewshot_context(**forwarded)
def _convert_standard(self, doc): def _convert_standard(self, doc):
out_doc = { out_doc = {
......
...@@ -23,11 +23,6 @@ class Pubmed_QA(HFTask): ...@@ -23,11 +23,6 @@ class Pubmed_QA(HFTask):
# HF is labelled as train but its really just for testing # HF is labelled as train but its really just for testing
return self.data["train"] return self.data["train"]
def fewshot_description(self):
# Average ctx length in labelled dataset is 238.9
# 2 few-shot exmamples pushes it beyond context window
return ""
def doc_to_text(self, doc): def doc_to_text(self, doc):
ctxs = "\n".join(doc["context"]["contexts"]) ctxs = "\n".join(doc["context"]["contexts"])
return "Abstract: {}\nQuestion: {}\nAnswer:".format( return "Abstract: {}\nQuestion: {}\nAnswer:".format(
......
...@@ -32,8 +32,8 @@ class QA4MRE(MultipleChoiceTask): ...@@ -32,8 +32,8 @@ class QA4MRE(MultipleChoiceTask):
if not os.path.isfile(f"data/qa4mre/QA4MRE-{year}-{lang}"): if not os.path.isfile(f"data/qa4mre/QA4MRE-{year}-{lang}"):
download_file( download_file(
url_path, url_path,
f"data/qa4mre/QA4MRE-{year}-{lang}_GS.xml", local_file=f"data/qa4mre/QA4MRE-{year}-{lang}_GS.xml",
sha256sums[year], expected_checksum=sha256sums[year],
) )
def has_training_docs(self): def has_training_docs(self):
...@@ -67,9 +67,6 @@ class QA4MRE(MultipleChoiceTask): ...@@ -67,9 +67,6 @@ class QA4MRE(MultipleChoiceTask):
out_doc['source'] = src out_doc['source'] = src
yield out_doc yield out_doc
def fewshot_description(self):
return ""
def test_docs(self): def test_docs(self):
return self.load_docs(f"data/qa4mre/QA4MRE-{self.YEAR}-EN_GS.xml") return self.load_docs(f"data/qa4mre/QA4MRE-{self.YEAR}-EN_GS.xml")
......
...@@ -51,11 +51,6 @@ class QuAC(Task): ...@@ -51,11 +51,6 @@ class QuAC(Task):
def test_docs(self): def test_docs(self):
raise NotImplementedError("QuAC has no test docs.") raise NotImplementedError("QuAC has no test docs.")
def fewshot_description(self):
# TODO: figure out fewshot description
desc = "TITLE: Title of the context passage - subtitle of the passage\nPARAGRAPH: Passage describing the relevant information for answering questions.\n\nQ: Text of a question.\n\nA: Answer to the question, based on the passage. If it cannot be answered based on the passage, write CANNOTANSWER"
return desc
def load_doc(self, myjson): def load_doc(self, myjson):
docs = [] docs = []
for item in myjson: for item in myjson:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment