Commit 7c9da714 authored by Jonathan Tow, committed by Jon Tow

Refactor `Task` download

parent 7064d6b9
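
The change below drops each task's hand-rolled download() (best_download fetches, zip extraction, XML parsing) in favor of the HuggingFace datasets loader, driven by the DATASET_PATH/DATASET_NAME class attributes. A minimal sketch of the pattern the refactored tasks rely on, assuming the base Task (not shown in this diff) simply forwards those attributes to datasets.load_dataset:

import datasets

class Task:
    # HF dataset identifier, or a path to a local dataset script.
    DATASET_PATH = None
    # Optional config name within that dataset.
    DATASET_NAME = None

    def __init__(self):
        self._training_docs = None
        self.download()

    def download(self):
        # One loader for every task; per-task download logic goes away.
        self.dataset = datasets.load_dataset(
            path=self.DATASET_PATH, name=self.DATASET_NAME
        )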
lm_eval/tasks/mutual.py

@@ -7,14 +7,12 @@ modified from Chinese high school English listening comprehension test data.
 Homepage: https://github.com/Nealcly/MuTual
 """
-import json
-import zipfile
-import shutil
 import numpy as np
-from pathlib import Path
+import inspect
+import lm_eval.datasets.mutual.mutual
 from lm_eval.base import Task, rf
 from lm_eval.metrics import mean
-from best_download import download_file
 
 _CITATION = """
@@ -30,29 +28,10 @@ _CITATION = """
 
 class MuTualBase(Task):
     VERSION = 1
-    BASE_PATH = Path("data/mutual")
+    DATASET_PATH = inspect.getfile(lm_eval.datasets.mutual.mutual)
     DATASET_NAME = None
     CHOICES = ['A', 'B', 'C', 'D']
 
-    def __init__(self):
-        super().__init__()
-
-    def download(self):
-        if self.BASE_PATH.exists():
-            return
-        Path.mkdir(self.BASE_PATH, parents=True)
-        master_zip = Path("data/master.zip")
-        download_file(
-            "https://github.com/Nealcly/MuTual/archive/master.zip",
-            local_file=str(master_zip),
-            expected_checksum="bb325cf6c672f0f02699993a37138b0fa0af6fcfc77ec81dfbe46add4d7b29f9")
-        with zipfile.ZipFile(master_zip, 'r') as zip:
-            zip.extractall("data")
-        Path("data/MuTual-master/data").rename(str(self.BASE_PATH))
-        # Remove left over files and directories.
-        master_zip.unlink()
-        shutil.rmtree("data/MuTual-master")
-
     def has_training_docs(self):
         return True
@@ -62,18 +41,11 @@ class MuTualBase(Task):
     def has_test_docs(self):
         return False
 
-    def _load_docs(self, path):
-        for file in sorted(path.iterdir()):
-            if file.suffix != ".txt":
-                continue
-            with open(file, 'r', encoding='utf-8') as f:
-                yield json.load(f)
-
     def training_docs(self):
-        return self._load_docs(self.BASE_PATH / self.DATASET_NAME / "train")
+        return self.dataset["train"]
 
     def validation_docs(self):
-        return self._load_docs(self.BASE_PATH / self.DATASET_NAME / "dev")
+        return self.dataset["validation"]
 
     def test_docs(self):
         return NotImplemented
@@ -134,8 +106,8 @@ class MuTualBase(Task):
 
 class MuTual(MuTualBase):
-    DATASET_NAME = Path("mutual")
+    DATASET_NAME = "mutual"
 
 
 class MuTualPlus(MuTualBase):
-    DATASET_NAME = Path("mutual_plus")
+    DATASET_NAME = "mutual_plus"
lm_eval/tasks/naturalqs.py

@@ -15,8 +15,7 @@ not even bother with the train set.
 Homepage: https://ai.google.com/research/NaturalQuestions
 """
-import random
-from . common import HFTask
+from lm_eval.base import Task
 from itertools import islice
@@ -30,7 +29,7 @@ _CITATION = """
 """
 
-class NaturalQs(HFTask):
+class NaturalQs(Task):
     VERSION = 0
     DATASET_PATH = "natural_questions"
     DATASET_NAME = None
@@ -47,7 +46,12 @@ class NaturalQs(HFTask):
     def training_docs(self):
         # Cache training for faster few-shot.
         # Data is too large to fit in memory.
-        return self.data["train"]
+        if self._training_docs is None:
+            self._training_docs = list(self.dataset["train"])
+        return self._training_docs
+
+    def validation_docs(self):
+        return self.dataset["validation"]
 
     def fewshot_examples(self, k, rnd):
         # Data is too large to fit in memory. We just sample from the first bit.
...
lm_eval/tasks/openbookqa.py

@@ -15,7 +15,6 @@ based algorithm and a word co-occurrence algorithm.
 Homepage: https://allenai.org/data/open-book-qa
 """
 from lm_eval.base import MultipleChoiceTask
-from .common import HFTask
 
 _CITATION = """
@@ -28,7 +27,7 @@ _CITATION = """
 """
 
-class OpenBookQA(HFTask, MultipleChoiceTask):
+class OpenBookQA(MultipleChoiceTask):
     VERSION = 0
     DATASET_PATH = "openbookqa"
     DATASET_NAME = "main"
@@ -42,6 +41,17 @@ class OpenBookQA(HFTask, MultipleChoiceTask):
     def has_test_docs(self):
         return True
 
+    def training_docs(self):
+        if self._training_docs is None:
+            self._training_docs = list(self.dataset["train"])
+        return map(self._convert_standard, self._training_docs)
+
+    def validation_docs(self):
+        return map(self._convert_standard, self.dataset["validation"])
+
+    def test_docs(self):
+        return map(self._convert_standard, self.dataset["test"])
+
     def _convert_standard(self, doc):
         out_doc = {
             "id": doc["id"],
...
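
The training_docs pattern above materializes the train split once, then re-wraps the cached list in map on each call so every caller gets a fresh, already-converted iterator. A standalone sketch of the same idea (class and field names here are illustrative, not from this diff):

class CachedDocsExample:
    def __init__(self, raw_train_docs):
        self._raw_train_docs = raw_train_docs  # any iterable of dicts
        self._training_docs = None             # filled lazily on first call

    def _convert_standard(self, doc):
        # Illustrative conversion; the real tasks reshape HF rows here.
        return {"query": doc["question"]}

    def training_docs(self):
        if self._training_docs is None:
            # Materialize once; repeated `map` calls are cheap and restartable.
            self._training_docs = list(self._raw_train_docs)
        return map(self._convert_standard, self._training_docs)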
lm_eval/tasks/pile.py

@@ -10,15 +10,9 @@ math, computer science, and philosophy papers.
 Homepage: https://pile.eleuther.ai/
 """
-import os
-import lm_dataformat
-import abc
-import numpy as np
-from lm_eval.base import rf, PerplexityTask
-from ..metrics import mean, matthews_corrcoef, f1_score
-from ..utils import general_detokenize
-from best_download import download_file
+import inspect
+import lm_eval.datasets.pile.pile
+from lm_eval.base import PerplexityTask
 
 _CITATION = """
@@ -31,32 +25,10 @@ _CITATION = """
 
-class PilePerplexityTask(PerplexityTask, abc.ABC):
+class PilePerplexityTask(PerplexityTask):
     VERSION = 1
-    PILE_SET_NAME = None
-    VAL_PATH = 'data/pile/val.jsonl.zst'
-    TEST_PATH = 'data/pile/test.jsonl.zst'
-
-    def download(self):
-        # TODO: separate pile val/test out by component so we don't have to scan the entire file once per set
-        if not os.path.exists("data/pile/test.jsonl.zst"):
-            # todo use new best_download fallback api
-            os.makedirs("data/pile/", exist_ok=True)
-            download_file("http://eaidata.bmk.sh/data/pile/val.jsonl.zst", local_file=self.VAL_PATH, expected_checksum="264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92")
-            download_file("http://eaidata.bmk.sh/data/pile/test.jsonl.zst", local_file=self.TEST_PATH, expected_checksum="0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e")
-
-    def validation_docs(self):
-        rdr = lm_dataformat.Reader(self.VAL_PATH)
-        for doc, metadata in rdr.stream_data(get_meta=True):
-            if metadata["pile_set_name"] == self.PILE_SET_NAME:
-                yield doc
-
-    def test_docs(self):
-        rdr = lm_dataformat.Reader(self.TEST_PATH)
-        for doc, metadata in rdr.stream_data(get_meta=True):
-            if metadata["pile_set_name"] == self.PILE_SET_NAME:
-                yield doc
+    DATASET_PATH = inspect.getfile(lm_eval.datasets.pile.pile)
+    DATASET_NAME = None
 
     def has_validation_docs(self):
         return True
@@ -64,90 +36,98 @@ class PilePerplexityTask(PerplexityTask, abc.ABC):
     def has_test_docs(self):
         return True
 
+    def validation_docs(self):
+        for doc in self.dataset["validation"]:
+            yield doc["text"]
+
+    def test_docs(self):
+        for doc in self.dataset["test"]:
+            yield doc["text"]
+
 
 class PileArxiv(PilePerplexityTask):
-    PILE_SET_NAME = "ArXiv"
+    DATASET_NAME = "pile_arxiv"
 
 
 class PileBooks3(PilePerplexityTask):
-    PILE_SET_NAME = "Books3"
+    DATASET_NAME = "pile_books3"
 
 
 class PileBookCorpus2(PilePerplexityTask):
-    PILE_SET_NAME = "BookCorpus2"
+    DATASET_NAME = "pile_bookcorpus2"
 
 
 class PileDmMathematics(PilePerplexityTask):
-    PILE_SET_NAME = "DM Mathematics"
+    DATASET_NAME = "pile_dm-mathematics"
 
 
 class PileEnron(PilePerplexityTask):
-    PILE_SET_NAME = "Enron Emails"
+    DATASET_NAME = "pile_enron"
 
 
 class PileEuroparl(PilePerplexityTask):
-    PILE_SET_NAME = "EuroParl"
+    DATASET_NAME = "pile_europarl"
 
 
 class PileFreeLaw(PilePerplexityTask):
-    PILE_SET_NAME = "FreeLaw"
+    DATASET_NAME = "pile_freelaw"
 
 
 class PileGithub(PilePerplexityTask):
-    PILE_SET_NAME = "Github"
+    DATASET_NAME = "pile_github"
 
 
 class PileGutenberg(PilePerplexityTask):
-    PILE_SET_NAME = "Gutenberg (PG-19)"
+    DATASET_NAME = "pile_gutenberg"
 
 
 class PileHackernews(PilePerplexityTask):
-    PILE_SET_NAME = "HackerNews"
+    DATASET_NAME = "pile_hackernews"
 
 
 class PileNIHExporter(PilePerplexityTask):
-    PILE_SET_NAME = "NIH ExPorter"
+    DATASET_NAME = "pile_nih-exporter"
 
 
 class PileOpenSubtitles(PilePerplexityTask):
-    PILE_SET_NAME = "OpenSubtitles"
+    DATASET_NAME = "pile_opensubtitles"
 
 
 class PileOpenWebText2(PilePerplexityTask):
-    PILE_SET_NAME = "OpenWebText2"
+    DATASET_NAME = "pile_openwebtext2"
 
 
 class PilePhilPapers(PilePerplexityTask):
-    PILE_SET_NAME = "PhilPapers"
+    DATASET_NAME = "pile_philpapers"
 
 
 class PilePileCc(PilePerplexityTask):
-    PILE_SET_NAME = "Pile-CC"
+    DATASET_NAME = "pile_pile-cc"
 
 
 class PilePubmedAbstracts(PilePerplexityTask):
-    PILE_SET_NAME = "PubMed Abstracts"
+    DATASET_NAME = "pile_pubmed-abstracts"
 
 
 class PilePubmedCentral(PilePerplexityTask):
-    PILE_SET_NAME = "PubMed Central"
+    DATASET_NAME = "pile_pubmed-central"
 
 
 class PileStackExchange(PilePerplexityTask):
-    PILE_SET_NAME = "StackExchange"
+    DATASET_NAME = "pile_stackexchange"
 
 
 class PileUspto(PilePerplexityTask):
-    PILE_SET_NAME = "USPTO Backgrounds"
+    DATASET_NAME = "pile_upsto"
 
 
 class PileUbuntuIrc(PilePerplexityTask):
-    PILE_SET_NAME = "Ubuntu IRC"
+    DATASET_NAME = "pile_ubuntu-irc"
 
 
 class PileWikipedia(PilePerplexityTask):
-    PILE_SET_NAME = "Wikipedia (en)"
+    DATASET_NAME = "pile_wikipedia"
 
 
 class PileYoutubeSubtitles(PilePerplexityTask):
-    PILE_SET_NAME = "YoutubeSubtitles"
+    DATASET_NAME = "pile_youtubesubtitles"
lm_eval/tasks/piqa.py

@@ -9,10 +9,7 @@ actually learning about the world?
 Homepage: https://yonatanbisk.com/piqa/
 """
-import numpy as np
-from lm_eval.base import MultipleChoiceTask, rf
-from ..metrics import mean
-from . common import HFTask
+from lm_eval.base import MultipleChoiceTask
 
 _CITATION = """
@@ -29,7 +26,7 @@ _CITATION = """
 """
 
-class PiQA(HFTask, MultipleChoiceTask):
+class PiQA(MultipleChoiceTask):
     VERSION = 0
     DATASET_PATH = "piqa"
     DATASET_NAME = None
@@ -43,6 +40,14 @@ class PiQA(HFTask, MultipleChoiceTask):
     def has_test_docs(self):
         return False
 
+    def training_docs(self):
+        if self._training_docs is None:
+            self._training_docs = list(self.dataset["train"])
+        return map(self._convert_standard, self._training_docs)
+
+    def validation_docs(self):
+        return map(self._convert_standard, self.dataset["validation"])
+
     def _convert_standard(self, doc):
         out_doc = {
             "goal": doc["goal"],
...
lm_eval/tasks/prost.py

@@ -15,7 +15,6 @@ have been trained on data not specifically collected to succeed on PROST."
 Homepage: https://github.com/nala-cub/prost
 """
 from lm_eval.base import MultipleChoiceTask
-from . common import HFTask
 
 _CITATION = """
@@ -36,7 +35,7 @@ _CITATION = """
 """
 
-class PROST(HFTask, MultipleChoiceTask):
+class PROST(MultipleChoiceTask):
     VERSION = 0
     DATASET_PATH = "corypaik/prost"
     DATASET_NAME = None
@@ -50,6 +49,9 @@ class PROST(HFTask, MultipleChoiceTask):
     def has_test_docs(self):
         return True
 
+    def test_docs(self):
+        return map(self._convert_standard, self.dataset["test"])
+
     def fewshot_context(self, doc, num_fewshot, provide_description=None, rnd=None, description=None):
         assert num_fewshot == 0, 'PROST is designed to probe models in a zero-shot fashion only.'
         return super().fewshot_context(
...
lm_eval/tasks/pubmedqa.py

@@ -16,9 +16,8 @@ and (4) a yes/no/maybe answer which summarizes the conclusion.
 Homepage: https://pubmedqa.github.io/
 """
 import numpy as np
-from .common import HFTask
-from lm_eval.base import rf
-from ..metrics import mean
+from lm_eval.base import rf, Task
+from lm_eval.metrics import mean
 
 _CITATION = """
@@ -32,7 +31,7 @@ _CITATION = """
 """
 
-class Pubmed_QA(HFTask):
+class Pubmed_QA(Task):
     VERSION = 0
     DATASET_PATH = "pubmed_qa"
     DATASET_NAME = "pqa_labeled"
@@ -49,7 +48,7 @@ class Pubmed_QA(HFTask):
     def test_docs(self):
         if self.has_test_docs():
             # HF is labelled as train but its really just for testing
-            return self.data["train"]
+            return self.dataset["train"]
 
     def doc_to_text(self, doc):
         ctxs = "\n".join(doc["context"]["contexts"])
...
lm_eval/tasks/qa4mre.py

@@ -13,9 +13,6 @@ and Entrance Exam.
 Homepage: http://nlp.uned.es/clef-qa/repository/qa4mre.php
 """
-import os
-import xml.etree.ElementTree as ET
-from best_download import download_file
 from lm_eval.base import MultipleChoiceTask
 
@@ -31,35 +28,8 @@ _CITATION = """
 
 class QA4MRE(MultipleChoiceTask):
     VERSION = 0
-    YEAR = None
-
-    def download(self):
-        year = self.YEAR
-        lang = "EN"
-        base_path = (
-            "http://nlp.uned.es/clef-qa/repository/js/scripts/downloadFile.php?"
-            "file=/var/www/html/nlp/clef-qa/repository/resources/QA4MRE/"
-        )
-        # TODO: add side tasks?
-        variable_year_path = {
-            2011: '2011/Training_Data/Goldstandard/',
-            2012: '2012/Main_Task/Training_Data/Goldstandard/Used_in_Evaluation/',
-            2013: '2013/Main_Task/Training_Data/Goldstandard/'
-        }
-        sha256sums = {
-            2011: "6d2524952a3a015f2a82df785b85b5578681e3602ec276b4e72c01f4ebc50034",
-            2012: "f9edaf408f8ac93f89a643a0d0b19263a1bb5ce64f19b2af10df279a656dfb24",
-            2013: "c60e5aa4ec77e0493ef0b11d46bd1d74d58a499a3a2f871b8cf3af9536f0f094",
-        }
-        vpath = variable_year_path[year]
-        url_path = f"{base_path}{vpath}QA4MRE-{year}-{lang}_GS.xml"
-        if not os.path.exists("data/qa4mre"):
-            os.makedirs("data/qa4mre", exist_ok=True)
-        if not os.path.isfile(f"data/qa4mre/QA4MRE-{year}-{lang}"):
-            download_file(
-                url_path,
-                local_file=f"data/qa4mre/QA4MRE-{year}-{lang}_GS.xml",
-                expected_checksum=sha256sums[year],
-            )
+    DATASET_PATH = "qa4mre"
+    DATASET_NAME = None
 
     def has_training_docs(self):
         return False
@@ -70,39 +40,31 @@ class QA4MRE(MultipleChoiceTask):
     def has_test_docs(self):
         return True
 
-    def _convert_standard(self, question):
-        choices = [i.text for i in question.iter('answer')]
+    def test_docs(self):
+        # `qa4mre` only has train data so we use it for the test docs.
+        return map(self._convert_standard, self.dataset["train"])
+
+    def _convert_standard(self, doc):
+        choices = doc["answer_options"]["answer_str"]
         out_doc = {
-            "query": question.find('q_str').text,
+            "source": doc["document_str"].strip().replace("\'", "'"),
+            "query": doc["question_str"],
             "choices": choices,
-            "gold": int(question.find("./answer[@correct='Yes']").attrib["a_id"]) - 1,
+            "gold": int(doc["correct_answer_id"]) - 1,
         }
         return out_doc
 
-    def load_docs(self, textfilename, tfds=False):
-        tree = ET.parse(textfilename)
-        root = tree.getroot()
-        # TODO: context is much larger than the context sometimes
-        # at the moment, it just gets left-truncated by LM automatically, and maybe that's good enough?
-        for reading_test in root.iter('reading-test'):
-            src = reading_test[0].text
-            src = src.strip().replace("\'", "'")
-            for qid, question in enumerate(reading_test.iter('q')):
-                out_doc = self._convert_standard(question)
-                out_doc['source'] = src
-                yield out_doc
-
-    def test_docs(self):
-        return self.load_docs(f"data/qa4mre/QA4MRE-{self.YEAR}-EN_GS.xml")
-
     def doc_to_text(self, doc):
         return "{}\nQuestion: {}\nAnswer:".format(doc["source"], doc["query"])
 
 
 class QA4MRE_2011(QA4MRE):
-    YEAR = 2011
+    DATASET_NAME = "2011.main.EN"
 
 
 class QA4MRE_2012(QA4MRE):
-    YEAR = 2012
+    DATASET_NAME = "2012.main.EN"
 
 
 class QA4MRE_2013(QA4MRE):
-    YEAR = 2013
+    DATASET_NAME = "2013.main.EN"
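
The HF qa4mre configs expose the fields the old XML parser extracted by hand, and correct_answer_id is 1-based in the source data, hence the "- 1" when building the 0-based gold index. A toy run of the _convert_standard logic on a made-up record (field values invented for illustration):

doc = {
    "document_str": "A passage about reading comprehension.",
    "question_str": "What is the passage about?",
    "answer_options": {"answer_str": ["Sports", "Reading", "Cooking"]},
    "correct_answer_id": "2",  # 1-based in the source data
}
out_doc = {
    "source": doc["document_str"].strip().replace("\'", "'"),
    "query": doc["question_str"],
    "choices": doc["answer_options"]["answer_str"],
    "gold": int(doc["correct_answer_id"]) - 1,  # 0-based -> index 1
}
assert out_doc["choices"][out_doc["gold"]] == "Reading"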
lm_eval/tasks/qasper.py

@@ -11,13 +11,10 @@ provide supporting evidence to answers.
 Homepage: https://allenai.org/data/qasper
 """
 from collections import Counter
-from math import exp
-import random
 import re
 import string
-from lm_eval.base import rf
+from lm_eval.base import rf, Task
 from lm_eval.metrics import f1_score, mean
-from .common import HFTask
 
 _CITATION = """
@@ -104,11 +101,20 @@ def token_f1_score(prediction, ground_truth):
     return f1
 
-class QASPER(HFTask):
+class QASPER(Task):
     VERSION = 0
     DATASET_PATH = "qasper"
     DATASET_NAME = None
 
+    def has_training_docs(self):
+        return True
+
+    def has_validation_docs(self):
+        return True
+
+    def has_test_docs(self):
+        return False
+
     def doc_to_text(self, doc):
         return (
             "TITLE: "
@@ -130,11 +136,11 @@ class QASPER(HFTask):
         return " " + answer
 
     def training_docs(self):
-        for doc in self.data["train"]:
+        for doc in self.dataset["train"]:
             yield from self.process_doc(doc)
 
     def validation_docs(self):
-        for doc in self.data["train"]:
+        for doc in self.dataset["validation"]:
             yield from self.process_doc(doc)
 
     def process_doc(self, doc):
...
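
Worth noting: the old validation_docs iterated self.data["train"], so beyond the attribute rename this hunk also fixes a latent bug by pointing validation at the actual "validation" split.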