Unverified commit 6caa0afd, authored by Leo Gao, committed by GitHub

Merge pull request #300 from jon-tow/hf-dataset-refactor

Refactor `Task` downloading to use `HuggingFace.datasets`
parents 7064d6b9 9434722c
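Every file changed below follows the same pattern: a task drops its bespoke `HFTask`/`download` plumbing and instead declares `DATASET_PATH`/`DATASET_NAME`, letting the base `Task` fetch data through the `datasets` library. A minimal sketch of that shared pattern, assuming a base-class `download` roughly like the one this PR introduces (the `__init__` body here is illustrative, not copied from the PR):

```python
# Sketch of the pattern the refactor adopts. Only DATASET_PATH, DATASET_NAME,
# and self.dataset are taken from the diffs; the rest is assumed.
import datasets


class Task:
    DATASET_PATH = None  # HF hub path, or path to a local loading script
    DATASET_NAME = None  # optional config/subset name

    def __init__(self):
        self._training_docs = None
        self.download()

    def download(self):
        # One call replaces the per-task download/checksum/unzip logic.
        self.dataset = datasets.load_dataset(
            path=self.DATASET_PATH,
            name=self.DATASET_NAME,
        )
```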
@@ -10,7 +10,6 @@ Homepage: https://math-qa.github.io/math-QA/
 """
 import re
 from lm_eval.base import MultipleChoiceTask
-from . common import HFTask

 _CITATION = """
@@ -25,7 +24,7 @@ _CITATION = """
 """

-class MathQA(HFTask, MultipleChoiceTask):
+class MathQA(MultipleChoiceTask):
     VERSION = 0
     DATASET_PATH = "math_qa"
     DATASET_NAME = None
@@ -39,13 +38,23 @@ class MathQA(HFTask, MultipleChoiceTask):
     def has_test_docs(self):
         return True

-    def _convert_standard(self, doc):
+    def training_docs(self):
+        if self._training_docs is None:
+            self._training_docs = list(map(self._process_doc, self.dataset["train"]))
+        return self._training_docs
+
+    def validation_docs(self):
+        return map(self._process_doc, self.dataset["validation"])
+
+    def test_docs(self):
+        return map(self._process_doc, self.dataset["test"])
+
+    def _process_doc(self, doc):
         answer_idx = ['a', 'b', 'c', 'd', 'e'].index(doc['correct'])
         choices = [c[4:].rstrip(" ,") for c in re.findall(r"[abcd] \) .*?, |e \) .*?$", doc['options'])]
         out_doc = {
-            "query": "Question: " + doc['Problem'] +"\nAnswer:",
+            "query": "Question: " + doc['Problem'] + "\nAnswer:",
             "choices": choices,
             "gold": answer_idx,
         }
...
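For reference, the `_process_doc` regex above splits MathQA's flat `options` string into clean answer choices. A standalone demonstration (the example string is hypothetical but mimics the dataset's `a ) ... , b ) ...` format implied by the regex):

```python
import re

# Hypothetical MathQA-style options string (format inferred from the regex).
options = "a ) 38 , b ) 27.675 , c ) 30 , d ) 40 , e ) 20"

choices = [
    c[4:].rstrip(" ,")  # drop the "a ) " prefix and the trailing " , "
    for c in re.findall(r"[abcd] \) .*?, |e \) .*?$", options)
]
print(choices)  # ['38', '27.675', '30', '40', '20']
```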
@@ -20,9 +20,8 @@ of a question's options. See section 4 of the paper for details.
 Homepage: https://leaderboard.allenai.org/mctaco/submissions/public
 """
 import numpy as np
-from lm_eval.base import rf
 from collections import defaultdict
-from . common import HFTask
+from lm_eval.base import rf, Task

 _CITATION = """
@@ -35,7 +34,7 @@ _CITATION = """
 """

-class MCTACO(HFTask):
+class MCTACO(Task):
     VERSION = 0
     DATASET_PATH = "mc_taco"
     DATASET_NAME = None
@@ -49,6 +48,12 @@ class MCTACO(HFTask):
     def has_test_docs(self):
         return True

+    def validation_docs(self):
+        return self.dataset["validation"]
+
+    def test_docs(self):
+        return self.dataset["test"]
+
     def doc_to_text(self, doc):
         return f"{doc['sentence']}\nQuestion: {doc['question']}\n"\
             f"Answer: {doc['answer']}\nPlausible:"
...
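The unchanged `doc_to_text` shows what each MC-TACO record becomes at prompt time. A quick illustration with an invented record (only the field names come from the diff; the values are hypothetical):

```python
# Hypothetical MC-TACO record; only the field names are taken from the diff.
doc = {
    "sentence": "He ate dinner at six.",
    "question": "How long did dinner last?",
    "answer": "30 minutes",
}

prompt = (
    f"{doc['sentence']}\nQuestion: {doc['question']}\n"
    f"Answer: {doc['answer']}\nPlausible:"
)
print(prompt)
```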
@@ -7,14 +7,11 @@ modified from Chinese high school English listening comprehension test data.
 Homepage: https://github.com/Nealcly/MuTual
 """
-import json
-import zipfile
-import shutil
 import numpy as np
-from pathlib import Path
+import inspect
+import lm_eval.datasets.mutual.mutual
 from lm_eval.base import Task, rf
 from lm_eval.metrics import mean
-from best_download import download_file

 _CITATION = """
@@ -30,29 +27,10 @@ _CITATION = """
 class MuTualBase(Task):
     VERSION = 1
-    BASE_PATH = Path("data/mutual")
+    DATASET_PATH = inspect.getfile(lm_eval.datasets.mutual.mutual)
     DATASET_NAME = None
     CHOICES = ['A', 'B', 'C', 'D']

-    def __init__(self):
-        super().__init__()
-
-    def download(self):
-        if self.BASE_PATH.exists():
-            return
-        Path.mkdir(self.BASE_PATH, parents=True)
-        master_zip = Path("data/master.zip")
-        download_file(
-            "https://github.com/Nealcly/MuTual/archive/master.zip",
-            local_file=str(master_zip),
-            expected_checksum="bb325cf6c672f0f02699993a37138b0fa0af6fcfc77ec81dfbe46add4d7b29f9")
-        with zipfile.ZipFile(master_zip, 'r') as zip:
-            zip.extractall("data")
-        Path("data/MuTual-master/data").rename(str(self.BASE_PATH))
-        # Remove left over files and directories.
-        master_zip.unlink()
-        shutil.rmtree("data/MuTual-master")
-
     def has_training_docs(self):
         return True
@@ -62,18 +40,11 @@ class MuTualBase(Task):
     def has_test_docs(self):
         return False

-    def _load_docs(self, path):
-        for file in sorted(path.iterdir()):
-            if file.suffix != ".txt":
-                continue
-            with open(file, 'r', encoding='utf-8') as f:
-                yield json.load(f)
-
     def training_docs(self):
-        return self._load_docs(self.BASE_PATH / self.DATASET_NAME / "train")
+        return self.dataset["train"]

     def validation_docs(self):
-        return self._load_docs(self.BASE_PATH / self.DATASET_NAME / "dev")
+        return self.dataset["validation"]

     def test_docs(self):
         return NotImplemented
@@ -134,8 +105,8 @@ class MuTualBase(Task):
 class MuTual(MuTualBase):
-    DATASET_NAME = Path("mutual")
+    DATASET_NAME = "mutual"

 class MuTualPlus(MuTualBase):
-    DATASET_NAME = Path("mutual_plus")
+    DATASET_NAME = "mutual_plus"
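MuTual has no ready-made hub dataset, so the refactor points `DATASET_PATH` at a loading script packaged inside `lm_eval.datasets`, resolved via `inspect.getfile`. A sketch of why that works, assuming `datasets.load_dataset` is given the script path as it was at the time of this PR:

```python
import inspect

import datasets
import lm_eval.datasets.mutual.mutual

# inspect.getfile resolves the packaged loading script to an absolute path
# on disk, which datasets.load_dataset accepts in place of a hub dataset ID.
script = inspect.getfile(lm_eval.datasets.mutual.mutual)
dataset = datasets.load_dataset(path=script, name="mutual_plus")
```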
@@ -15,8 +15,7 @@ not even bother with the train set.
 Homepage: https://ai.google.com/research/NaturalQuestions
 """
-import random
-from . common import HFTask
+from lm_eval.base import Task
 from itertools import islice
@@ -30,7 +29,7 @@ _CITATION = """
 """

-class NaturalQs(HFTask):
+class NaturalQs(Task):
     VERSION = 0
     DATASET_PATH = "natural_questions"
     DATASET_NAME = None
@@ -47,7 +46,12 @@ class NaturalQs(HFTask):
     def training_docs(self):
         # Cache training for faster few-shot.
         # Data is too large to fit in memory.
-        return self.data["train"]
+        if self._training_docs is None:
+            self._training_docs = list(self.dataset["train"])
+        return self._training_docs
+
+    def validation_docs(self):
+        return self.dataset["validation"]

     def fewshot_examples(self, k, rnd):
         # Data is too large to fit in memory. We just sample from the first bit.
...
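Natural Questions is too large to materialize fully, which is why only the training split is cached for repeated few-shot sampling and why `islice` stays imported: few-shot examples are drawn from just the head of the stream. A sketch of that sampling idea (the function body and the `head` size are assumptions, not the PR's exact code):

```python
from itertools import islice

# Assumed sketch: sample k few-shot examples from only the first `head`
# records of a split too large to load into memory in full.
def fewshot_examples(dataset, k, rnd, head=1000):
    pool = list(islice(dataset["train"], head))  # scan only the head
    return rnd.sample(pool, k)
```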
@@ -15,7 +15,6 @@ based algorithm and a word co-occurrence algorithm.
 Homepage: https://allenai.org/data/open-book-qa
 """
 from lm_eval.base import MultipleChoiceTask
-from .common import HFTask

 _CITATION = """
@@ -28,7 +27,7 @@ _CITATION = """
 """

-class OpenBookQA(HFTask, MultipleChoiceTask):
+class OpenBookQA(MultipleChoiceTask):
     VERSION = 0
     DATASET_PATH = "openbookqa"
     DATASET_NAME = "main"
@@ -42,7 +41,18 @@ class OpenBookQA(HFTask, MultipleChoiceTask):
     def has_test_docs(self):
         return True

-    def _convert_standard(self, doc):
+    def training_docs(self):
+        if self._training_docs is None:
+            self._training_docs = list(map(self._process_doc, self.dataset["train"]))
+        return self._training_docs
+
+    def validation_docs(self):
+        return map(self._process_doc, self.dataset["validation"])
+
+    def test_docs(self):
+        return map(self._process_doc, self.dataset["test"])
+
+    def _process_doc(self, doc):
         out_doc = {
             "id": doc["id"],
             "query": doc["question_stem"],
...
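Both MathQA and OpenBookQA now map raw records into the same `{"query", "choices", "gold"}` shape that `MultipleChoiceTask` consumes. A sketch of how a scorer uses that shape (the dict shape comes from the diffs; the `score` stand-in and the record values are hypothetical):

```python
def score(query: str, choice: str) -> float:
    # Stand-in for a model log-likelihood call; assumed, not the harness API.
    return -abs(len(query) - len(choice))


# Hypothetical processed doc in the standardized multiple-choice shape.
doc = {
    "query": "The sun is responsible for",
    "choices": ["puppies learning new tricks", "plants sprouting"],
    "gold": 1,
}

# One (query, choice) continuation is scored per option; the prediction is
# the highest-scoring choice, compared against the gold index for accuracy.
lls = [score(doc["query"], " " + c) for c in doc["choices"]]
pred = max(range(len(lls)), key=lls.__getitem__)
acc = float(pred == doc["gold"])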
@@ -10,15 +10,9 @@ math, computer science, and philosophy papers.
 Homepage: https://pile.eleuther.ai/
 """
-import os
-import lm_dataformat
-import abc
-import numpy as np
-from lm_eval.base import rf, PerplexityTask
-from ..metrics import mean, matthews_corrcoef, f1_score
-from ..utils import general_detokenize
-from best_download import download_file
+import inspect
+import lm_eval.datasets.pile.pile
+from lm_eval.base import PerplexityTask

 _CITATION = """
@@ -31,32 +25,10 @@ _CITATION = """
 """

-class PilePerplexityTask(PerplexityTask, abc.ABC):
+class PilePerplexityTask(PerplexityTask):
     VERSION = 1
-    PILE_SET_NAME = None
-    VAL_PATH = 'data/pile/val.jsonl.zst'
-    TEST_PATH = 'data/pile/test.jsonl.zst'
+    DATASET_PATH = inspect.getfile(lm_eval.datasets.pile.pile)
+    DATASET_NAME = None

-    def download(self):
-        # TODO: separate pile val/test out by component so we don't have to scan the entire file once per set
-        if not os.path.exists("data/pile/test.jsonl.zst"):
-            # todo use new best_download fallback api
-            os.makedirs("data/pile/", exist_ok=True)
-            download_file("http://eaidata.bmk.sh/data/pile/val.jsonl.zst", local_file=self.VAL_PATH, expected_checksum="264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92")
-            download_file("http://eaidata.bmk.sh/data/pile/test.jsonl.zst", local_file=self.TEST_PATH, expected_checksum="0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e")
-
-    def validation_docs(self):
-        rdr = lm_dataformat.Reader(self.VAL_PATH)
-        for doc, metadata in rdr.stream_data(get_meta=True):
-            if metadata["pile_set_name"] == self.PILE_SET_NAME:
-                yield doc
-
-    def test_docs(self):
-        rdr = lm_dataformat.Reader(self.TEST_PATH)
-        for doc, metadata in rdr.stream_data(get_meta=True):
-            if metadata["pile_set_name"] == self.PILE_SET_NAME:
-                yield doc
-
     def has_validation_docs(self):
         return True
@@ -64,90 +36,98 @@ class PilePerplexityTask(PerplexityTask, abc.ABC):
     def has_test_docs(self):
         return True

+    def validation_docs(self):
+        for doc in self.dataset["validation"]:
+            yield doc["text"]
+
+    def test_docs(self):
+        for doc in self.dataset["test"]:
+            yield doc["text"]
+
 class PileArxiv(PilePerplexityTask):
-    PILE_SET_NAME = "ArXiv"
+    DATASET_NAME = "pile_arxiv"

 class PileBooks3(PilePerplexityTask):
-    PILE_SET_NAME = "Books3"
+    DATASET_NAME = "pile_books3"

 class PileBookCorpus2(PilePerplexityTask):
-    PILE_SET_NAME = "BookCorpus2"
+    DATASET_NAME = "pile_bookcorpus2"

 class PileDmMathematics(PilePerplexityTask):
-    PILE_SET_NAME = "DM Mathematics"
+    DATASET_NAME = "pile_dm-mathematics"

 class PileEnron(PilePerplexityTask):
-    PILE_SET_NAME = "Enron Emails"
+    DATASET_NAME = "pile_enron"

 class PileEuroparl(PilePerplexityTask):
-    PILE_SET_NAME = "EuroParl"
+    DATASET_NAME = "pile_europarl"

 class PileFreeLaw(PilePerplexityTask):
-    PILE_SET_NAME = "FreeLaw"
+    DATASET_NAME = "pile_freelaw"

 class PileGithub(PilePerplexityTask):
-    PILE_SET_NAME = "Github"
+    DATASET_NAME = "pile_github"

 class PileGutenberg(PilePerplexityTask):
-    PILE_SET_NAME = "Gutenberg (PG-19)"
+    DATASET_NAME = "pile_gutenberg"

 class PileHackernews(PilePerplexityTask):
-    PILE_SET_NAME = "HackerNews"
+    DATASET_NAME = "pile_hackernews"

 class PileNIHExporter(PilePerplexityTask):
-    PILE_SET_NAME = "NIH ExPorter"
+    DATASET_NAME = "pile_nih-exporter"

 class PileOpenSubtitles(PilePerplexityTask):
-    PILE_SET_NAME = "OpenSubtitles"
+    DATASET_NAME = "pile_opensubtitles"

 class PileOpenWebText2(PilePerplexityTask):
-    PILE_SET_NAME = "OpenWebText2"
+    DATASET_NAME = "pile_openwebtext2"

 class PilePhilPapers(PilePerplexityTask):
-    PILE_SET_NAME = "PhilPapers"
+    DATASET_NAME = "pile_philpapers"

 class PilePileCc(PilePerplexityTask):
-    PILE_SET_NAME = "Pile-CC"
+    DATASET_NAME = "pile_pile-cc"

 class PilePubmedAbstracts(PilePerplexityTask):
-    PILE_SET_NAME = "PubMed Abstracts"
+    DATASET_NAME = "pile_pubmed-abstracts"

 class PilePubmedCentral(PilePerplexityTask):
-    PILE_SET_NAME = "PubMed Central"
+    DATASET_NAME = "pile_pubmed-central"

 class PileStackExchange(PilePerplexityTask):
-    PILE_SET_NAME = "StackExchange"
+    DATASET_NAME = "pile_stackexchange"

 class PileUspto(PilePerplexityTask):
PILE_SET_NAME = "USPTO Backgrounds" DATASET_NAME = "pile_upsto"
 class PileUbuntuIrc(PilePerplexityTask):
-    PILE_SET_NAME = "Ubuntu IRC"
+    DATASET_NAME = "pile_ubuntu-irc"

 class PileWikipedia(PilePerplexityTask):
-    PILE_SET_NAME = "Wikipedia (en)"
+    DATASET_NAME = "pile_wikipedia"

 class PileYoutubeSubtitles(PilePerplexityTask):
-    PILE_SET_NAME = "YoutubeSubtitles"
+    DATASET_NAME = "pile_youtubesubtitles"