Unverified Commit 6caa0afd authored by Leo Gao, committed by GitHub

Merge pull request #300 from jon-tow/hf-dataset-refactor

Refactor `Task` downloading to use `HuggingFace.datasets`
parents 7064d6b9 9434722c
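The changes below replace each task's hand-rolled `download()` logic (zip archives, `best_download` checksums, `lm_dataformat` readers) with splits served by `HuggingFace.datasets`. A minimal sketch of the resulting pattern, assuming the shared base class builds `self.dataset` via `datasets.load_dataset` (that plumbing lives in `lm_eval.base` and is not shown in this diff):

```python
# Illustrative sketch only; the real split-loading wiring lives in lm_eval.base.
import datasets


class ExampleHFBackedTask:
    # Class attributes mirroring the refactored tasks in this PR.
    DATASET_PATH = "math_qa"  # HF Hub identifier, or a path to a local dataset script
    DATASET_NAME = None       # optional configuration name

    def __init__(self):
        # One call replaces the per-task download()/checksum/extract logic.
        self.dataset = datasets.load_dataset(self.DATASET_PATH, self.DATASET_NAME)
        self._training_docs = None

    def training_docs(self):
        # Cache the train split once; validation/test splits can be iterated lazily.
        if self._training_docs is None:
            self._training_docs = list(self.dataset["train"])
        return self._training_docs
```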
@@ -10,7 +10,6 @@ Homepage: https://math-qa.github.io/math-QA/
"""
import re
from lm_eval.base import MultipleChoiceTask
from . common import HFTask
_CITATION = """
@@ -25,7 +24,7 @@ _CITATION = """
"""
class MathQA(HFTask, MultipleChoiceTask):
class MathQA(MultipleChoiceTask):
VERSION = 0
DATASET_PATH = "math_qa"
DATASET_NAME = None
@@ -39,13 +38,23 @@ class MathQA(HFTask, MultipleChoiceTask):
def has_test_docs(self):
return True
def _convert_standard(self, doc):
def training_docs(self):
if self._training_docs is None:
self._training_docs = list(map(self._process_doc, self.dataset["train"]))
return self._training_docs
def validation_docs(self):
return map(self._process_doc, self.dataset["validation"])
def test_docs(self):
return map(self._process_doc, self.dataset["test"])
def _process_doc(self, doc):
answer_idx = ['a', 'b', 'c', 'd', 'e'].index(doc['correct'])
choices = [c[4:].rstrip(" ,") for c in re.findall(r"[abcd] \) .*?, |e \) .*?$", doc['options'])]
out_doc = {
"query": "Question: " + doc['Problem'] +"\nAnswer:",
"query": "Question: " + doc['Problem'] + "\nAnswer:",
"choices": choices,
"gold": answer_idx,
}
......
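As a quick check of the options parsing above: `re.findall` extracts the `a ) … ,` through `e ) …` segments, and `c[4:].rstrip(" ,")` drops the letter prefix and trailing comma. A standalone example with an invented options string in MathQA's flattened format:

```python
import re

# Invented example in MathQA's "a ) ... , b ) ... , ... , e ) ..." options format.
options = "a ) 38 , b ) 27.675 , c ) 30 , d ) data inadequate , e ) none of these"
choices = [c[4:].rstrip(" ,") for c in re.findall(r"[abcd] \) .*?, |e \) .*?$", options)]
print(choices)  # ['38', '27.675', '30', 'data inadequate', 'none of these']
```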
@@ -20,9 +20,8 @@ of a question's options. See section 4 of the paper for details.
Homepage: https://leaderboard.allenai.org/mctaco/submissions/public
"""
import numpy as np
from lm_eval.base import rf
from collections import defaultdict
from . common import HFTask
from lm_eval.base import rf, Task
_CITATION = """
@@ -35,7 +34,7 @@ _CITATION = """
"""
class MCTACO(HFTask):
class MCTACO(Task):
VERSION = 0
DATASET_PATH = "mc_taco"
DATASET_NAME = None
@@ -49,6 +48,12 @@ class MCTACO(HFTask):
def has_test_docs(self):
return True
def validation_docs(self):
return self.dataset["validation"]
def test_docs(self):
return self.dataset["test"]
def doc_to_text(self, doc):
return f"{doc['sentence']}\nQuestion: {doc['question']}\n"\
f"Answer: {doc['answer']}\nPlausible:"
......
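Only the source of the documents changes here; the prompt built by `doc_to_text` is untouched. For illustration, with a hypothetical record using the `mc_taco` field names (values invented):

```python
# Hypothetical MC-TACO record; field names follow the mc_taco schema, values are invented.
doc = {
    "sentence": "He ate dinner at the restaurant.",
    "question": "How long did it take him to finish dinner?",
    "answer": "30 minutes",
}
prompt = (
    f"{doc['sentence']}\nQuestion: {doc['question']}\n"
    f"Answer: {doc['answer']}\nPlausible:"
)
print(prompt)
# He ate dinner at the restaurant.
# Question: How long did it take him to finish dinner?
# Answer: 30 minutes
# Plausible:
```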
@@ -7,14 +7,11 @@ modified from Chinese high school English listening comprehension test data.
Homepage: https://github.com/Nealcly/MuTual
"""
import json
import zipfile
import shutil
import numpy as np
from pathlib import Path
import inspect
import lm_eval.datasets.mutual.mutual
from lm_eval.base import Task, rf
from lm_eval.metrics import mean
from best_download import download_file
_CITATION = """
@@ -30,29 +27,10 @@ _CITATION = """
class MuTualBase(Task):
VERSION = 1
BASE_PATH = Path("data/mutual")
DATASET_PATH = inspect.getfile(lm_eval.datasets.mutual.mutual)
DATASET_NAME = None
CHOICES = ['A', 'B', 'C', 'D']
def __init__(self):
super().__init__()
def download(self):
if self.BASE_PATH.exists():
return
Path.mkdir(self.BASE_PATH, parents=True)
master_zip = Path("data/master.zip")
download_file(
"https://github.com/Nealcly/MuTual/archive/master.zip",
local_file=str(master_zip),
expected_checksum="bb325cf6c672f0f02699993a37138b0fa0af6fcfc77ec81dfbe46add4d7b29f9")
with zipfile.ZipFile(master_zip, 'r') as zip:
zip.extractall("data")
Path("data/MuTual-master/data").rename(str(self.BASE_PATH))
# Remove left over files and directories.
master_zip.unlink()
shutil.rmtree("data/MuTual-master")
def has_training_docs(self):
return True
@@ -62,18 +40,11 @@ class MuTualBase(Task):
def has_test_docs(self):
return False
def _load_docs(self, path):
for file in sorted(path.iterdir()):
if file.suffix != ".txt":
continue
with open(file, 'r', encoding='utf-8') as f:
yield json.load(f)
def training_docs(self):
return self._load_docs(self.BASE_PATH / self.DATASET_NAME / "train")
return self.dataset["train"]
def validation_docs(self):
return self._load_docs(self.BASE_PATH / self.DATASET_NAME / "dev")
return self.dataset["validation"]
def test_docs(self):
return NotImplemented
@@ -134,8 +105,8 @@ class MuTualBase(Task):
class MuTual(MuTualBase):
DATASET_NAME = Path("mutual")
DATASET_NAME = "mutual"
class MuTualPlus(MuTualBase):
DATASET_NAME = Path("mutual_plus")
DATASET_NAME = "mutual_plus"
@@ -15,8 +15,7 @@ not even bother with the train set.
Homepage: https://ai.google.com/research/NaturalQuestions
"""
import random
from . common import HFTask
from lm_eval.base import Task
from itertools import islice
@@ -30,7 +29,7 @@ _CITATION = """
"""
class NaturalQs(HFTask):
class NaturalQs(Task):
VERSION = 0
DATASET_PATH = "natural_questions"
DATASET_NAME = None
@@ -47,7 +46,12 @@ class NaturalQs(HFTask):
def training_docs(self):
# Cache training for faster few-shot.
# Data is too large to fit in memory.
return self.data["train"]
if self._training_docs is None:
self._training_docs = list(self.dataset["train"])
return self._training_docs
def validation_docs(self):
return self.dataset["validation"]
def fewshot_examples(self, k, rnd):
# Data is too large to fit in memory. We just sample from the first bit.
......
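The body of `fewshot_examples` is collapsed above; the comment and the `islice` import indicate that few-shot examples are drawn from an initial slice of the train split rather than loading all of Natural Questions. A hedged sketch of that idea (the window size and sampling call are assumptions, not the code from this PR):

```python
import random
from itertools import islice


def fewshot_examples_sketch(dataset, k, rnd: random.Random, window: int = 1000):
    # Natural Questions is too large to materialize fully, so take only the
    # first `window` training examples and sample the k-shot prompt from them.
    head = list(islice(dataset["train"], window))
    return rnd.sample(head, k)
```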
@@ -15,7 +15,6 @@ based algorithm and a word co-occurrence algorithm.
Homepage: https://allenai.org/data/open-book-qa
"""
from lm_eval.base import MultipleChoiceTask
from .common import HFTask
_CITATION = """
@@ -28,7 +27,7 @@ _CITATION = """
"""
class OpenBookQA(HFTask, MultipleChoiceTask):
class OpenBookQA(MultipleChoiceTask):
VERSION = 0
DATASET_PATH = "openbookqa"
DATASET_NAME = "main"
@@ -42,7 +41,18 @@ class OpenBookQA(HFTask, MultipleChoiceTask):
def has_test_docs(self):
return True
def _convert_standard(self, doc):
def training_docs(self):
if self._training_docs is None:
self._training_docs = list(map(self._process_doc, self.dataset["train"]))
return self._training_docs
def validation_docs(self):
return map(self._process_doc, self.dataset["validation"])
def test_docs(self):
return map(self._process_doc, self.dataset["test"])
def _process_doc(self, doc):
out_doc = {
"id": doc["id"],
"query": doc["question_stem"],
......
@@ -10,15 +10,9 @@ math, computer science, and philosophy papers.
Homepage: https://pile.eleuther.ai/
"""
import os
import lm_dataformat
import abc
import numpy as np
from lm_eval.base import rf, PerplexityTask
from ..metrics import mean, matthews_corrcoef, f1_score
from ..utils import general_detokenize
from best_download import download_file
import inspect
import lm_eval.datasets.pile.pile
from lm_eval.base import PerplexityTask
_CITATION = """
@@ -31,32 +25,10 @@ _CITATION = """
"""
class PilePerplexityTask(PerplexityTask, abc.ABC):
class PilePerplexityTask(PerplexityTask):
VERSION = 1
PILE_SET_NAME = None
VAL_PATH = 'data/pile/val.jsonl.zst'
TEST_PATH = 'data/pile/test.jsonl.zst'
def download(self):
# TODO: separate pile val/test out by component so we don't have to scan the entire file once per set
if not os.path.exists("data/pile/test.jsonl.zst"):
# todo use new best_download fallback api
os.makedirs("data/pile/", exist_ok=True)
download_file("http://eaidata.bmk.sh/data/pile/val.jsonl.zst", local_file=self.VAL_PATH, expected_checksum="264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92")
download_file("http://eaidata.bmk.sh/data/pile/test.jsonl.zst", local_file=self.TEST_PATH, expected_checksum="0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e")
def validation_docs(self):
rdr = lm_dataformat.Reader(self.VAL_PATH)
for doc, metadata in rdr.stream_data(get_meta=True):
if metadata["pile_set_name"] == self.PILE_SET_NAME:
yield doc
def test_docs(self):
rdr = lm_dataformat.Reader(self.TEST_PATH)
for doc, metadata in rdr.stream_data(get_meta=True):
if metadata["pile_set_name"] == self.PILE_SET_NAME:
yield doc
DATASET_PATH = inspect.getfile(lm_eval.datasets.pile.pile)
DATASET_NAME = None
def has_validation_docs(self):
return True
@@ -64,90 +36,98 @@ class PilePerplexityTask(PerplexityTask, abc.ABC):
def has_test_docs(self):
return True
def validation_docs(self):
for doc in self.dataset["validation"]:
yield doc["text"]
def test_docs(self):
for doc in self.dataset["test"]:
yield doc["text"]
class PileArxiv(PilePerplexityTask):
PILE_SET_NAME = "ArXiv"
DATASET_NAME = "pile_arxiv"
class PileBooks3(PilePerplexityTask):
PILE_SET_NAME = "Books3"
DATASET_NAME = "pile_books3"
class PileBookCorpus2(PilePerplexityTask):
PILE_SET_NAME = "BookCorpus2"
DATASET_NAME = "pile_bookcorpus2"
class PileDmMathematics(PilePerplexityTask):
PILE_SET_NAME = "DM Mathematics"
DATASET_NAME = "pile_dm-mathematics"
class PileEnron(PilePerplexityTask):
PILE_SET_NAME = "Enron Emails"
DATASET_NAME = "pile_enron"
class PileEuroparl(PilePerplexityTask):
PILE_SET_NAME = "EuroParl"
DATASET_NAME = "pile_europarl"
class PileFreeLaw(PilePerplexityTask):
PILE_SET_NAME = "FreeLaw"
DATASET_NAME = "pile_freelaw"
class PileGithub(PilePerplexityTask):
PILE_SET_NAME = "Github"
DATASET_NAME = "pile_github"
class PileGutenberg(PilePerplexityTask):
PILE_SET_NAME = "Gutenberg (PG-19)"
DATASET_NAME = "pile_gutenberg"
class PileHackernews(PilePerplexityTask):
PILE_SET_NAME = "HackerNews"
DATASET_NAME = "pile_hackernews"
class PileNIHExporter(PilePerplexityTask):
PILE_SET_NAME = "NIH ExPorter"
DATASET_NAME = "pile_nih-exporter"
class PileOpenSubtitles(PilePerplexityTask):
PILE_SET_NAME = "OpenSubtitles"
DATASET_NAME = "pile_opensubtitles"
class PileOpenWebText2(PilePerplexityTask):
PILE_SET_NAME = "OpenWebText2"
DATASET_NAME = "pile_openwebtext2"
class PilePhilPapers(PilePerplexityTask):
PILE_SET_NAME = "PhilPapers"
DATASET_NAME = "pile_philpapers"
class PilePileCc(PilePerplexityTask):
PILE_SET_NAME = "Pile-CC"
DATASET_NAME = "pile_pile-cc"
class PilePubmedAbstracts(PilePerplexityTask):
PILE_SET_NAME = "PubMed Abstracts"
DATASET_NAME = "pile_pubmed-abstracts"
class PilePubmedCentral(PilePerplexityTask):
PILE_SET_NAME = "PubMed Central"
DATASET_NAME = "pile_pubmed-central"
class PileStackExchange(PilePerplexityTask):
PILE_SET_NAME = "StackExchange"
DATASET_NAME = "pile_stackexchange"
class PileUspto(PilePerplexityTask):
PILE_SET_NAME = "USPTO Backgrounds"
DATASET_NAME = "pile_upsto"
class PileUbuntuIrc(PilePerplexityTask):
PILE_SET_NAME = "Ubuntu IRC"
DATASET_NAME = "pile_ubuntu-irc"
class PileWikipedia(PilePerplexityTask):
PILE_SET_NAME = "Wikipedia (en)"
DATASET_NAME = "pile_wikipedia"
class PileYoutubeSubtitles(PilePerplexityTask):
PILE_SET_NAME = "YoutubeSubtitles"
DATASET_NAME = "pile_youtubesubtitles"