Commit 55e62507 authored by researcher2

Merge branch 'master' into researcher2

parents bb0eafbb 26f0233f
......@@ -2,10 +2,9 @@ from . common import HFTask
from lm_eval.base import MultipleChoiceTask
class HeadQA(HFTask, MultipleChoiceTask):
class HeadQABase(HFTask, MultipleChoiceTask):
VERSION = 0
DATASET_PATH = "head_qa"
DATASET_NAME = None
def has_training_docs(self):
return True
......@@ -25,9 +24,19 @@ class HeadQA(HFTask, MultipleChoiceTask):
}
return out_doc
def fewshot_description(self):
# TODO: figure out description
return ""
def doc_to_text(self, doc):
return doc["query"]
class HeadQAEn(HeadQABase):
DATASET_NAME = "en"
class HeadQAEs(HeadQABase):
DATASET_NAME = "es"
# for backwards compatibility
class HeadQAEsDeprecated(HeadQABase):
DATASET_NAME = "es"
def __init__(self):
super().__init__()
print("WARNING: headqa is deprecated. Please use headqa_es or headqa_en instead. See https://github.com/EleutherAI/lm-evaluation-harness/pull/240 for more info.")
\ No newline at end of file
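The HeadQAEsDeprecated alias above keeps the old headqa task name working while steering users toward headqa_es/headqa_en. Below is a minimal sketch of the same alias routed through Python's warnings module instead of print, so the notice can be filtered or escalated by downstream tooling; it assumes the HeadQABase class defined earlier in this diff and is a variation, not what this commit actually does.

import warnings

class HeadQAEsDeprecated(HeadQABase):
    DATASET_NAME = "es"

    def __init__(self):
        super().__init__()
        # same message as the print() above, emitted as a DeprecationWarning
        warnings.warn(
            "headqa is deprecated. Please use headqa_es or headqa_en instead. "
            "See https://github.com/EleutherAI/lm-evaluation-harness/pull/240 for more info.",
            DeprecationWarning,
            stacklevel=2,
        )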
......@@ -35,11 +35,6 @@ class HellaSwag(HFTask, MultipleChoiceTask):
}
return out_doc
def fewshot_description(self):
return "Label for the relevant action: Sentences describing the " \
"context, with an incomplete sentence trailing\nanswer that " \
"plausibly completes the situation."
def doc_to_text(self, doc):
return doc["query"]
......
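The HellaSwag hunk above is one of many in this merge that delete a per-task fewshot_description, so any prompt preamble now has to come from the caller instead. A rough sketch of that idea, under the assumption that the evaluator accepts a description_dict mapping task names to preamble strings (an assumption about the surrounding API, not something shown in this diff):

from lm_eval import evaluator

# assumption: simple_evaluate accepts description_dict; the old HellaSwag text is reused here
results = evaluator.simple_evaluate(
    model="gpt2",
    tasks=["hellaswag"],
    num_fewshot=0,
    description_dict={"hellaswag": "Label for the relevant action: ..."},
)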
......@@ -20,7 +20,7 @@ class Ethics(Task):
def download(self):
if not os.path.exists('data/ethics/done'):
sh("mkdir -p data")
download_file("https://people.eecs.berkeley.edu/~hendrycks/ethics.tar", "data/ethics.tar", "40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333")
download_file("https://people.eecs.berkeley.edu/~hendrycks/ethics.tar", local_file="data/ethics.tar", expected_checksum="40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333")
sh("""
tar -xf data/ethics.tar -C data/
rm data/ethics.tar
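Several hunks in this merge switch best_download's download_file call from positional to keyword arguments. A minimal usage sketch of the new call shape, with placeholder URL, path, and checksum values (each task supplies its own):

from best_download import download_file

# placeholder values; the real tasks pass their own URL, target path, and sha256
download_file(
    "https://example.com/dataset.tar",            # source URL (still positional)
    local_file="data/dataset.tar",                # where the archive is written
    expected_checksum="<sha256 of the archive>",  # verified after the download
)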
......@@ -237,9 +237,6 @@ class EthicsUtilitarianismOriginal(Ethics):
for doc in docs:
yield {"activity": doc[0], "baseline": doc[1], "rating": ""}
def fewshot_description(self):
return "Rate how pleasant each of the following activities is on a scale from 1 (very unpleasant) to 10 (very pleasant).\n\n"
def fewshot_examples(self, k, rnd):
# Override fewshot_examples because k can be at most 5
assert k <= 5, "There are only 5 possible shots for this task. Refer to the V2 for more."
......@@ -350,9 +347,6 @@ class EthicsVirtue(Ethics):
def get_prefix(self):
return "virtue/virtue"
def fewshot_description(self):
return "The following is a list of sentences and traits, along with whether the trait is exhibited in that sentence.\n\n"
def process_doc(self, doc):
# Append identifiers before shuffling so exact matches can be calculated later on, and skip the first element (the header row)
return [x + [i] for i, x in enumerate(doc[1:])]
......
......@@ -18,7 +18,7 @@ class Math(Task):
def download(self):
if not (self.DATASET_PATH / 'test').exists() or not (self.DATASET_PATH / 'done').exists():
sh(f"mkdir -p {self.DATASET_PATH}")
download_file("https://people.eecs.berkeley.edu/~hendrycks/MATH.tar", f"{self.DATASET_PATH}.tar", "01256fd7cd5430596fdf07e6e6a5827111b5235b7ffed679c662a12f898932da")
download_file("https://people.eecs.berkeley.edu/~hendrycks/MATH.tar", local_file=f"{self.DATASET_PATH}.tar", expected_checksum="01256fd7cd5430596fdf07e6e6a5827111b5235b7ffed679c662a12f898932da")
sh(f"""
tar -xf {self.DATASET_PATH}.tar -C data/ && touch {self.DATASET_PATH / 'done'}
rm {self.DATASET_PATH}.tar
......@@ -55,9 +55,6 @@ class Math(Task):
def test_docs(self):
return self._load_docs(self.DATASET_PATH / "test" / self.get_file_info())
def fewshot_description(self):
return "Given a mathematics problem, determine the answer. Simplify your answer as much as possible."
def doc_to_text(self, doc):
return "Problem: " + doc["problem"] + "\nAnswer:"
......
......@@ -45,7 +45,7 @@ class GeneralHendrycksTest(MultipleChoiceTask):
def download(self):
if not (self.DATASET_PATH / 'done').exists():
sh("mkdir -p data")
download_file("https://people.eecs.berkeley.edu/~hendrycks/data.tar", "data/data.tar", "78a804365a59028188fb19bd1adcadc5e0c260b220a9d8b2e33a5ea7d5fbe3b4")
download_file("https://people.eecs.berkeley.edu/~hendrycks/data.tar", local_file="data/data.tar", expected_checksum="78a804365a59028188fb19bd1adcadc5e0c260b220a9d8b2e33a5ea7d5fbe3b4")
sh("""
tar -xf data/data.tar -C data/
rm data/data.tar
......@@ -114,9 +114,5 @@ class GeneralHendrycksTest(MultipleChoiceTask):
return rnd.sample(list(self._fewshot_docs), k)
def fewshot_description(self):
subject = self.subject.replace("_", " ")
return f"The following are multiple choice questions (with answers) about {subject}."
def doc_to_text(self, doc):
return doc["query"]
......@@ -14,8 +14,8 @@ class LAMBADA(Task):
if not os.path.exists("data/lambada/lambada_test.jsonl"):
download_file(
"http://eaidata.bmk.sh/data/lambada_test.jsonl",
"data/lambada/lambada_test.jsonl",
"4aa8d02cd17c719165fc8a7887fddd641f43fcafa4b1c806ca8abc31fabdb226"
local_file="data/lambada/lambada_test.jsonl",
expected_checksum="4aa8d02cd17c719165fc8a7887fddd641f43fcafa4b1c806ca8abc31fabdb226"
)
except:
# fallback - for some reason best_download doesn't work all the time here
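The try/except above falls back to a manual fetch when best_download misbehaves; the fallback body itself is collapsed in this view. A rough sketch of the same guard-plus-fallback idea, using urllib and hashlib as stand-ins for the real fallback code (which is not shown in this diff):

import hashlib
import os
import urllib.request

from best_download import download_file

URL = "http://eaidata.bmk.sh/data/lambada_test.jsonl"
PATH = "data/lambada/lambada_test.jsonl"
SHA256 = "4aa8d02cd17c719165fc8a7887fddd641f43fcafa4b1c806ca8abc31fabdb226"

os.makedirs(os.path.dirname(PATH), exist_ok=True)
if not os.path.exists(PATH):
    try:
        download_file(URL, local_file=PATH, expected_checksum=SHA256)
    except Exception:
        # hypothetical fallback: plain urllib fetch, then verify the checksum by hand
        urllib.request.urlretrieve(URL, PATH)
        with open(PATH, "rb") as fh:
            assert hashlib.sha256(fh.read()).hexdigest() == SHA256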
......@@ -53,10 +53,6 @@ class LAMBADA(Task):
def doc_to_target(self, doc):
return " " + doc['text'].rsplit(' ', 1)[1]
def fewshot_description(self):
# TODO: figure out description
return ""
def construct_requests(self, doc, ctx):
ll, is_greedy = rf.loglikelihood(ctx, self.doc_to_target(doc))
......
......@@ -13,6 +13,3 @@ class LAMBADA_cloze(LAMBADA):
def doc_to_target(self, doc):
return " " + doc['text'].rsplit(' ', 1)[1]
def fewshot_description(self):
return "Fill in blank:\n"
......@@ -32,8 +32,8 @@ class MultilingualLAMBADA(lambada.LAMBADA):
if not os.path.exists(f):
download_file(
url,
f,
CHECKSUMS[self.LANG]
local_file=f,
expected_checksum=CHECKSUMS[self.LANG]
)
except:
# fallback - for some reason best_download doesn't work all the time here
......
......@@ -19,7 +19,7 @@ class LogiQA(MultipleChoiceTask):
]
for split in splits:
file = self.DATASET_PATH / f"{split['name']}.txt"
download_file(f"{base_url}/{split['name']}.txt", str(file), split["checksum"])
download_file(f"{base_url}/{split['name']}.txt", local_file=str(file), expected_checksum=split["checksum"])
def has_training_docs(self):
return True
......@@ -81,10 +81,6 @@ class LogiQA(MultipleChoiceTask):
def test_docs(self):
return self._load_docs(self.DATASET_PATH / "Test.txt")
def fewshot_description(self):
# TODO: figure out actual description
return ""
def doc_to_text(self, doc):
return doc["query"]
......
......@@ -29,10 +29,6 @@ class MathQA(HFTask, MultipleChoiceTask):
}
return out_doc
def fewshot_description(self):
# TODO: figure out description
return ""
def doc_to_text(self, doc):
return doc["query"]
......
......@@ -39,9 +39,6 @@ class MCTACO(HFTask):
def has_test_docs(self):
return True
def fewshot_description(self):
return "Determine whether the candidate answer is plausible (\"yes\") or not (\"no\")"
def doc_to_text(self, doc):
return f"{doc['sentence']}\nQuestion: {doc['question']}\n"\
f"Answer: {doc['answer']}\nPlausible:"
......
......@@ -36,8 +36,8 @@ class MuTualBase(Task):
master_zip = Path("data/master.zip")
download_file(
"https://github.com/Nealcly/MuTual/archive/master.zip",
str(master_zip),
"bb325cf6c672f0f02699993a37138b0fa0af6fcfc77ec81dfbe46add4d7b29f9")
local_file=str(master_zip),
expected_checksum="bb325cf6c672f0f02699993a37138b0fa0af6fcfc77ec81dfbe46add4d7b29f9")
with zipfile.ZipFile(master_zip, 'r') as zip:
zip.extractall("data")
Path("data/MuTual-master/data").rename(str(self.BASE_PATH))
......@@ -70,10 +70,6 @@ class MuTualBase(Task):
def test_docs(self):
return NotImplemented
def fewshot_description(self):
# TODO: figure out fewshot description
return ""
def doc_to_text(self, doc):
return self.detokenize(doc["article"])
......
......@@ -21,10 +21,6 @@ class NaturalQs(HFTask):
def has_test_docs(self):
return False
def fewshot_description(self):
# TODO: figure out description
return ""
def training_docs(self):
# Cache training for faster few-shot.
# Data is too large to fit in memory.
......
......@@ -25,9 +25,5 @@ class OpenBookQA(HFTask, MultipleChoiceTask):
}
return out_doc
def fewshot_description(self):
# TODO: figure out fewshot description
return ""
def doc_to_text(self, doc):
return doc["query"]
......@@ -10,7 +10,7 @@ from best_download import download_file
class PilePerplexityTask(PerplexityTask, abc.ABC):
VERSION = 0
VERSION = 1
PILE_SET_NAME = None
VAL_PATH = 'data/pile/val.jsonl.zst'
......@@ -18,9 +18,11 @@ class PilePerplexityTask(PerplexityTask, abc.ABC):
def download(self):
# TODO: separate pile val/test out by component so we don't have to scan the entire file once per set
os.makedirs("data/pile/", exist_ok=True)
download_file("https://the-eye.eu/public/AI/pile/val.jsonl.zst", self.VAL_PATH, "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92")
download_file("https://the-eye.eu/public/AI/pile/test.jsonl.zst", self.TEST_PATH, "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e")
if not os.path.exists("data/pile/test.jsonl.zst"):
# TODO: use the new best_download fallback API
os.makedirs("data/pile/", exist_ok=True)
download_file("http://eaidata.bmk.sh/data/pile/val.jsonl.zst", local_file=self.VAL_PATH, expected_checksum="264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92")
download_file("http://eaidata.bmk.sh/data/pile/test.jsonl.zst", local_file=self.TEST_PATH, expected_checksum="0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e")
def validation_docs(self):
rdr = lm_dataformat.Reader(self.VAL_PATH)
......
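validation_docs above opens the shared Pile validation split with lm_dataformat; narrowing it to one Pile component (the full-file scan the earlier TODO complains about) could look roughly like the helper below. It assumes lm_dataformat's Reader.stream_data(get_meta=True) and the Pile's pile_set_name metadata field, and the helper name is hypothetical:

import lm_dataformat

def pile_component_docs(path, pile_set_name):
    # stream (text, metadata) pairs and keep only the requested Pile component
    rdr = lm_dataformat.Reader(path)
    for text, meta in rdr.stream_data(get_meta=True):
        if meta.get("pile_set_name") == pile_set_name:
            yield text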
......@@ -18,10 +18,6 @@ class PiQA(HFTask, MultipleChoiceTask):
def has_test_docs(self):
return False
def fewshot_description(self):
# TODO: figure out fewshot description
return ""
def _convert_standard(self, doc):
out_doc = {
"goal": doc["goal"],
......
......@@ -36,13 +36,14 @@ class PROST(HFTask, MultipleChoiceTask):
def has_test_docs(self):
return True
def fewshot_description(self):
# TODO: figure out fewshot description
return ""
def fewshot_context(self, doc, num_fewshot, provide_description, rnd):
def fewshot_context(self, doc, num_fewshot, provide_description=None, rnd=None, description=None):
assert num_fewshot == 0, 'PROST is designed to probe models in a zero-shot fashion only.'
return super().fewshot_context(doc, num_fewshot, provide_description, rnd)
return super().fewshot_context(
doc=doc,
num_fewshot=num_fewshot,
rnd=rnd,
description=description
)
def _convert_standard(self, doc):
out_doc = {
......
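The new fewshot_context signature above takes rnd and the prompt preamble as keyword arguments. A minimal caller-side sketch against that signature; task and doc are hypothetical placeholders, and provide_description is presumably kept only for backwards compatibility:

import random

rnd = random.Random(1234)        # few-shot examples are sampled with an explicit Random instance
ctx = task.fewshot_context(      # `task` and `doc` stand in for a real Task and document
    doc=doc,
    num_fewshot=0,               # PROST asserts zero-shot use only
    rnd=rnd,
    description="",              # optional natural-language preamble
)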
......@@ -23,11 +23,6 @@ class Pubmed_QA(HFTask):
# HF labels this split as train, but it's really just for testing
return self.data["train"]
def fewshot_description(self):
# Average ctx length in labelled dataset is 238.9
# 2 few-shot examples push it beyond the context window
return ""
def doc_to_text(self, doc):
ctxs = "\n".join(doc["context"]["contexts"])
return "Abstract: {}\nQuestion: {}\nAnswer:".format(
......
......@@ -32,8 +32,8 @@ class QA4MRE(MultipleChoiceTask):
if not os.path.isfile(f"data/qa4mre/QA4MRE-{year}-{lang}"):
download_file(
url_path,
f"data/qa4mre/QA4MRE-{year}-{lang}_GS.xml",
sha256sums[year],
local_file=f"data/qa4mre/QA4MRE-{year}-{lang}_GS.xml",
expected_checksum=sha256sums[year],
)
def has_training_docs(self):
......@@ -67,9 +67,6 @@ class QA4MRE(MultipleChoiceTask):
out_doc['source'] = src
yield out_doc
def fewshot_description(self):
return ""
def test_docs(self):
return self.load_docs(f"data/qa4mre/QA4MRE-{self.YEAR}-EN_GS.xml")
......
......@@ -51,11 +51,6 @@ class QuAC(Task):
def test_docs(self):
raise NotImplementedError("QuAC has no test docs.")
def fewshot_description(self):
# TODO: figure out fewshot description
desc = "TITLE: Title of the context passage - subtitle of the passage\nPARAGRAPH: Passage describing the relevant information for answering questions.\n\nQ: Text of a question.\n\nA: Answer to the question, based on the passage. If it cannot be answered based on the passage, write CANNOTANSWER"
return desc
def load_doc(self, myjson):
docs = []
for item in myjson:
......