Unverified commit 4887d9d3, authored by Jonathan Tow, committed by GitHub

Merge pull request #307 from Taekyoon/multilingual-ko-korsts

Add klue-sts task to eval Korean language task
Parents: 7064d6b9 4a1041c1
@@ -14,13 +14,6 @@ in the broader discourse.
 Homepage: https://zenodo.org/record/2630551#.X4Xzn5NKjUI
 """
 from . import lambada
-from lm_eval.base import Task, rf
-from lm_eval.metrics import mean, perplexity
-from lm_eval.utils import sh
-from best_download import download_file
-import json
-from functools import partial
-import os

 _CITATION = """
@@ -35,68 +28,37 @@ _CITATION = """
 """

-LANGS = ["en", "fr", "de", "it", "es"]
-
-CHECKSUMS = {"en": "4aa8d02cd17c719165fc8a7887fddd641f43fcafa4b1c806ca8abc31fabdb226",
-             "fr": "941ec6a73dba7dc91c860bf493eb66a527cd430148827a4753a4535a046bf362",
-             "de": "51c6c1795894c46e88e4c104b5667f488efe79081fb34d746b82b8caa663865e",
-             "it": "86654237716702ab74f42855ae5a78455c1b0e50054a4593fb9c6fcf7fad0850",
-             "es": "ffd760026c647fb43c67ce1bc56fd527937304b348712dce33190ea6caba6f9c"
-             }

 class MultilingualLAMBADA(lambada.LAMBADA):
     VERSION = 0

-    def __init__(self, lang=None):
-        self.LANG = lang
-        super().__init__()
-
-    def download(self):
-        sh("mkdir -p data/lambada")
-        f = f"data/lambada/lambada_test_{self.LANG}.jsonl"
-        url = f"http://eaidata.bmk.sh/data/lambada_test_{self.LANG}.jsonl"
-        try:
-            if not os.path.exists(f):
-                download_file(
-                    url,
-                    local_file=f,
-                    expected_checksum=CHECKSUMS[self.LANG]
-                )
-        except:
-            # fallback - for some reason best_download doesnt work all the time here
-            sh(f"wget {url} -O {f}")
-            sh(f'echo "{CHECKSUMS[self.LANG]} {f}" | sha256sum --check')
-
-    def validation_docs(self):
-        with open(f"data/lambada/lambada_test_{self.LANG}.jsonl") as fh:
-            for line in fh:
-                yield json.loads(line)

 class MultilingualLAMBADAEN(MultilingualLAMBADA):
-    def __init__(self):
-        super().__init__('en')
+    DATASET_NAME = 'en'

 class MultilingualLAMBADAFR(MultilingualLAMBADA):
-    def __init__(self):
-        super().__init__('fr')
+    DATASET_NAME = 'fr'

 class MultilingualLAMBADADE(MultilingualLAMBADA):
-    def __init__(self):
-        super().__init__('de')
+    DATASET_NAME = 'de'

 class MultilingualLAMBADAIT(MultilingualLAMBADA):
-    def __init__(self):
-        super().__init__('it')
+    DATASET_NAME = 'it'

 class MultilingualLAMBADAES(MultilingualLAMBADA):
-    def __init__(self):
-        super().__init__('es')
+    DATASET_NAME = 'es'

-LANG_CLASSES = [MultilingualLAMBADAEN, MultilingualLAMBADAFR,
-                MultilingualLAMBADADE, MultilingualLAMBADAIT,
-                MultilingualLAMBADAES]
+LANG_CLASSES = [MultilingualLAMBADAEN, MultilingualLAMBADAFR, MultilingualLAMBADADE, MultilingualLAMBADAIT, MultilingualLAMBADAES]

 def construct_tasks():
     tasks = {}
-    for lang, lang_class in zip(LANGS, LANG_CLASSES):
-        tasks[f"lambada_mt_{lang}"] = lang_class
+    for lang_class in LANG_CLASSES:
+        tasks[f"lambada_mt_{lang_class.DATASET_NAME}"] = lang_class
     return tasks
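With the LANGS list gone, construct_tasks() now derives each registry key from the subclass's DATASET_NAME. A minimal sketch of the resulting mapping, using only names that appear in the diff above (the import path is assumed from the repository layout):

from lm_eval.tasks.lambada_multilingual import construct_tasks  # path assumed

# Keys are derived from each subclass's DATASET_NAME attribute.
construct_tasks()
# => {"lambada_mt_en": MultilingualLAMBADAEN,
#     "lambada_mt_fr": MultilingualLAMBADAFR,
#     "lambada_mt_de": MultilingualLAMBADADE,
#     "lambada_mt_it": MultilingualLAMBADAIT,
#     "lambada_mt_es": MultilingualLAMBADAES}

The task names the harness exposes are therefore unchanged; only their source of truth moves into the classes.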
@@ -10,9 +10,9 @@ NLP setting.
 Homepage: https://github.com/lgw863/LogiQA-dataset
 """
+import inspect
+import lm_eval.datasets.logiqa.logiqa
 from lm_eval.base import MultipleChoiceTask
-from best_download import download_file
-from pathlib import Path

 _CITATION = """
@@ -29,21 +29,8 @@ _CITATION = """

 class LogiQA(MultipleChoiceTask):
     VERSION = 0
-    DATASET_PATH = Path("data/logiqa")
+    DATASET_PATH = inspect.getfile(lm_eval.datasets.logiqa.logiqa)
+    DATASET_NAME = None

-    def download(self):
-        if self.DATASET_PATH.exists():
-            return
-        Path.mkdir(self.DATASET_PATH, parents=True)
-        base_url = "https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master"
-        splits = [
-            {"name": "Train", "checksum": "7d5bb1f58278e33b395744cd2ad8d7600faa0b3c4d615c659a44ec1181d759fa"},
-            {"name": "Eval", "checksum": "4c49e6753b7262c001506b9151135abf722247035ab075dad93acdea5789c01f"},
-            {"name": "Test", "checksum": "359acb78c37802208f7fde9e2f6574b8526527c63d6a336f90a53f1932cb4701"}
-        ]
-        for split in splits:
-            file = self.DATASET_PATH / f"{split['name']}.txt"
-            download_file(f"{base_url}/{split['name']}.txt", local_file=str(file), expected_checksum=split["checksum"])

     def has_training_docs(self):
         return True
@@ -54,7 +41,18 @@ class LogiQA(MultipleChoiceTask):
     def has_test_docs(self):
         return True

-    def _convert_standard(self, doc):
+    def training_docs(self):
+        if self._training_docs is None:
+            self._training_docs = list(map(self._process_doc, self.dataset["train"]))
+        return self._training_docs
+
+    def validation_docs(self):
+        return map(self._process_doc, self.dataset["validation"])
+
+    def test_docs(self):
+        return map(self._process_doc, self.dataset["test"])
+
+    def _process_doc(self, doc):
         def format_example(doc, choices):
             """
             Passage: <passage>
@@ -66,7 +64,7 @@ class LogiQA(MultipleChoiceTask):
             D. <choice4>
             Answer:
             """
-            prompt = "Passage: " + doc["passage"] + "\n"
+            prompt = "Passage: " + doc["context"] + "\n"
             prompt += "Question: " + doc["question"] + "\nChoices:\n"
             for choice, option in zip(choices, doc["options"]):
                 prompt += f"{choice.upper()}. {option}\n"
@@ -76,33 +74,8 @@ class LogiQA(MultipleChoiceTask):
         return {
             "query": format_example(doc, choices),
             "choices": doc["options"],
-            "gold": choices.index(doc["answerKey"])
+            "gold": choices.index(doc["label"])
         }

-    def _load_docs(self, filename):
-        def normalize(text):
-            return text.replace(".", ". ").strip()
-
-        with open(filename, 'r') as f:
-            docs = f.read().strip().split("\n\n")
-            for rawdoc in docs:
-                rawdoc = rawdoc.split("\n")
-                doc = {
-                    "answerKey": rawdoc[0].strip(),
-                    "passage": normalize(rawdoc[1]),
-                    "question": normalize(rawdoc[2]),
-                    "options": [normalize(option[2:]) for option in rawdoc[3:]]
-                }
-                yield self._convert_standard(doc)
-
-    def training_docs(self):
-        return self._load_docs(self.DATASET_PATH / "Train.txt")
-
-    def validation_docs(self):
-        return self._load_docs(self.DATASET_PATH / "Eval.txt")
-
-    def test_docs(self):
-        return self._load_docs(self.DATASET_PATH / "Test.txt")

     def doc_to_text(self, doc):
         return doc["query"]
@@ -10,7 +10,6 @@ Homepage: https://math-qa.github.io/math-QA/
 """
 import re
 from lm_eval.base import MultipleChoiceTask
-from . common import HFTask

 _CITATION = """
@@ -25,7 +24,7 @@ _CITATION = """
 """

-class MathQA(HFTask, MultipleChoiceTask):
+class MathQA(MultipleChoiceTask):
     VERSION = 0
     DATASET_PATH = "math_qa"
     DATASET_NAME = None
@@ -39,13 +38,23 @@ class MathQA(HFTask, MultipleChoiceTask):
     def has_test_docs(self):
         return True

-    def _convert_standard(self, doc):
+    def training_docs(self):
+        if self._training_docs is None:
+            self._training_docs = list(map(self._process_doc, self.dataset["train"]))
+        return self._training_docs
+
+    def validation_docs(self):
+        return map(self._process_doc, self.dataset["validation"])
+
+    def test_docs(self):
+        return map(self._process_doc, self.dataset["test"])
+
+    def _process_doc(self, doc):
         answer_idx = ['a', 'b', 'c', 'd', 'e'].index(doc['correct'])
         choices = [c[4:].rstrip(" ,") for c in re.findall(r"[abcd] \) .*?, |e \) .*?$", doc['options'])]

         out_doc = {
-            "query": "Question: " + doc['Problem'] +"\nAnswer:",
+            "query": "Question: " + doc['Problem'] + "\nAnswer:",
             "choices": choices,
             "gold": answer_idx,
        }
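The one subtle line in MathQA's _process_doc is the choices comprehension: the dataset packs all five options into a single string, and the regex splits them back apart. A worked example of that parse (the options string is invented for illustration, in the dataset's "a ) ... , b ) ..." format):

import re

options = "a ) 38 , b ) 27.675 , c ) 30 , d ) 28 , e ) none of these"
matches = re.findall(r"[abcd] \) .*?, |e \) .*?$", options)
# -> ['a ) 38 , ', 'b ) 27.675 , ', 'c ) 30 , ', 'd ) 28 , ', 'e ) none of these']
choices = [c[4:].rstrip(" ,") for c in matches]  # drop the "x ) " prefix and the trailing " ,"
# -> ['38', '27.675', '30', '28', 'none of these']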
@@ -20,9 +20,8 @@ of a question's options. See section 4 of the paper for details.
 Homepage: https://leaderboard.allenai.org/mctaco/submissions/public
 """
 import numpy as np
-from lm_eval.base import rf
 from collections import defaultdict
-from . common import HFTask
+from lm_eval.base import rf, Task

 _CITATION = """
@@ -35,7 +34,7 @@ _CITATION = """
 """

-class MCTACO(HFTask):
+class MCTACO(Task):
     VERSION = 0
     DATASET_PATH = "mc_taco"
     DATASET_NAME = None
@@ -49,6 +48,12 @@ class MCTACO(HFTask):
     def has_test_docs(self):
         return True

+    def validation_docs(self):
+        return self.dataset["validation"]
+
+    def test_docs(self):
+        return self.dataset["test"]
+
     def doc_to_text(self, doc):
         return f"{doc['sentence']}\nQuestion: {doc['question']}\n"\
             f"Answer: {doc['answer']}\nPlausible:"
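MCTACO scores each (sentence, question, candidate answer) triple independently, and doc_to_text renders the triple as a plausibility prompt. A sketch of the rendered prompt for a hypothetical mc_taco record (field names from the diff; values invented; import path assumed from the repository layout):

from lm_eval.tasks.mctaco import MCTACO  # path assumed

task = MCTACO()
doc = {
    "sentence": "He ate breakfast before leaving for work.",
    "question": "How long did it take him to eat breakfast?",
    "answer": "15 minutes",
}
print(task.doc_to_text(doc))
# He ate breakfast before leaving for work.
# Question: How long did it take him to eat breakfast?
# Answer: 15 minutes
# Plausible: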
@@ -7,14 +7,11 @@ modified from Chinese high school English listening comprehension test data.
 Homepage: https://github.com/Nealcly/MuTual
 """
-import json
-import zipfile
-import shutil
 import numpy as np
-from pathlib import Path
+import inspect
+import lm_eval.datasets.mutual.mutual
 from lm_eval.base import Task, rf
 from lm_eval.metrics import mean
-from best_download import download_file

 _CITATION = """
@@ -30,29 +27,10 @@ _CITATION = """

 class MuTualBase(Task):
     VERSION = 1
-    BASE_PATH = Path("data/mutual")
+    DATASET_PATH = inspect.getfile(lm_eval.datasets.mutual.mutual)
     DATASET_NAME = None
     CHOICES = ['A', 'B', 'C', 'D']

-    def __init__(self):
-        super().__init__()
-
-    def download(self):
-        if self.BASE_PATH.exists():
-            return
-        Path.mkdir(self.BASE_PATH, parents=True)
-        master_zip = Path("data/master.zip")
-        download_file(
-            "https://github.com/Nealcly/MuTual/archive/master.zip",
-            local_file=str(master_zip),
-            expected_checksum="bb325cf6c672f0f02699993a37138b0fa0af6fcfc77ec81dfbe46add4d7b29f9")
-        with zipfile.ZipFile(master_zip, 'r') as zip:
-            zip.extractall("data")
-        Path("data/MuTual-master/data").rename(str(self.BASE_PATH))
-        # Remove left over files and directories.
-        master_zip.unlink()
-        shutil.rmtree("data/MuTual-master")
-
     def has_training_docs(self):
         return True
@@ -62,18 +40,11 @@ class MuTualBase(Task):
     def has_test_docs(self):
         return False

-    def _load_docs(self, path):
-        for file in sorted(path.iterdir()):
-            if file.suffix != ".txt":
-                continue
-            with open(file, 'r', encoding='utf-8') as f:
-                yield json.load(f)
-
     def training_docs(self):
-        return self._load_docs(self.BASE_PATH / self.DATASET_NAME / "train")
+        return self.dataset["train"]

     def validation_docs(self):
-        return self._load_docs(self.BASE_PATH / self.DATASET_NAME / "dev")
+        return self.dataset["validation"]

     def test_docs(self):
         return NotImplemented
@@ -134,8 +105,8 @@ class MuTualBase(Task):

 class MuTual(MuTualBase):
-    DATASET_NAME = Path("mutual")
+    DATASET_NAME = "mutual"

 class MuTualPlus(MuTualBase):
-    DATASET_NAME = Path("mutual_plus")
+    DATASET_NAME = "mutual_plus"
@@ -15,8 +15,7 @@ not even bother with the train set.
 Homepage: https://ai.google.com/research/NaturalQuestions
 """
-import random
-from . common import HFTask
+from lm_eval.base import Task
 from itertools import islice

@@ -30,7 +29,7 @@ _CITATION = """
 """

-class NaturalQs(HFTask):
+class NaturalQs(Task):
     VERSION = 0
     DATASET_PATH = "natural_questions"
     DATASET_NAME = None
@@ -47,7 +46,12 @@ class NaturalQs(HFTask):
     def training_docs(self):
         # Cache training for faster few-shot.
         # Data is too large to fit in memory.
-        return self.data["train"]
+        if self._training_docs is None:
+            self._training_docs = list(self.dataset["train"])
+        return self._training_docs
+
+    def validation_docs(self):
+        return self.dataset["validation"]

     def fewshot_examples(self, k, rnd):
         # Data is too large to fit in memory. We just sample from the first bit.
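The fewshot_examples body is not shown in this hunk, but per its comment it samples shots from only the first slice of the train split via itertools.islice, since Natural Questions is too large to materialize. A rough sketch of that idea, under the assumption that shots are drawn from a fixed-size head (the function name and window size here are invented for illustration):

from itertools import islice
import random

def fewshot_from_head(train_split, k, rnd: random.Random, window=1000):
    # Materialize only the first `window` records and sample the k shots
    # from that head, never touching the rest of the split.
    head = list(islice(train_split, window))
    return rnd.sample(head, k)

# e.g. fewshot_from_head(task.dataset["train"], k=5, rnd=random.Random(42))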
@@ -15,7 +15,6 @@ based algorithm and a word co-occurrence algorithm.
 Homepage: https://allenai.org/data/open-book-qa
 """
 from lm_eval.base import MultipleChoiceTask
-from .common import HFTask

 _CITATION = """
@@ -28,7 +27,7 @@ _CITATION = """
 """

-class OpenBookQA(HFTask, MultipleChoiceTask):
+class OpenBookQA(MultipleChoiceTask):
     VERSION = 0
     DATASET_PATH = "openbookqa"
     DATASET_NAME = "main"
@@ -42,7 +41,18 @@ class OpenBookQA(HFTask, MultipleChoiceTask):
     def has_test_docs(self):
         return True

-    def _convert_standard(self, doc):
+    def training_docs(self):
+        if self._training_docs is None:
+            self._training_docs = list(map(self._process_doc, self.dataset["train"]))
+        return self._training_docs
+
+    def validation_docs(self):
+        return map(self._process_doc, self.dataset["validation"])
+
+    def test_docs(self):
+        return map(self._process_doc, self.dataset["test"])
+
+    def _process_doc(self, doc):
         out_doc = {
             "id": doc["id"],
             "query": doc["question_stem"],
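Every conversion in this commit follows the same contract: drop the HFTask mixin, keep DATASET_PATH/DATASET_NAME as plain strings (or a dataset-script path), and read splits from self.dataset. That only works if the base Task now populates self.dataset itself; a rough sketch of what it is assumed to do (illustrative only, not the harness's exact code):

import datasets

class Task:
    DATASET_PATH = None  # HF dataset name, or a path to a local dataset script
    DATASET_NAME = None  # optional sub-config, e.g. "main" for openbookqa

    def download(self):
        # Assumed behavior: the converted tasks all read from self.dataset,
        # so the base class must populate it roughly like this.
        self.dataset = datasets.load_dataset(self.DATASET_PATH, self.DATASET_NAME)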
[One collapsed file diff is not shown in this view.]
@@ -9,10 +9,7 @@ actually learning about the world?
 Homepage: https://yonatanbisk.com/piqa/
 """
-import numpy as np
-from lm_eval.base import MultipleChoiceTask, rf
-from ..metrics import mean
-from . common import HFTask
+from lm_eval.base import MultipleChoiceTask

 _CITATION = """
@@ -29,7 +26,7 @@ _CITATION = """
 """

-class PiQA(HFTask, MultipleChoiceTask):
+class PiQA(MultipleChoiceTask):
     VERSION = 0
     DATASET_PATH = "piqa"
     DATASET_NAME = None
@@ -43,7 +40,15 @@ class PiQA(HFTask, MultipleChoiceTask):
     def has_test_docs(self):
         return False

-    def _convert_standard(self, doc):
+    def training_docs(self):
+        if self._training_docs is None:
+            self._training_docs = list(map(self._process_doc, self.dataset["train"]))
+        return self._training_docs
+
+    def validation_docs(self):
+        return map(self._process_doc, self.dataset["validation"])
+
+    def _process_doc(self, doc):
         out_doc = {
             "goal": doc["goal"],
             "choices": [doc["sol1"], doc["sol2"]],
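PiQA defines no test_docs because has_test_docs() returns False (presumably since the test split's labels are not public). A sketch of _process_doc on a hypothetical record (field names from the diff; values invented; import path assumed from the repository layout):

from lm_eval.tasks.piqa import PiQA  # path assumed

task = PiQA()
doc = {
    "goal": "Make the edges of a cake less dry.",
    "sol1": "Brush the cake layers with simple syrup.",
    "sol2": "Brush the cake layers with flour.",
}
out = task._process_doc(doc)
# out["goal"]    -> doc["goal"]
# out["choices"] -> [doc["sol1"], doc["sol2"]]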
@@ -15,7 +15,6 @@ have been trained on data not specifically collected to succeed on PROST."
 Homepage: https://github.com/nala-cub/prost
 """
 from lm_eval.base import MultipleChoiceTask
-from . common import HFTask

 _CITATION = """
@@ -36,7 +35,7 @@ _CITATION = """
 """

-class PROST(HFTask, MultipleChoiceTask):
+class PROST(MultipleChoiceTask):
     VERSION = 0
     DATASET_PATH = "corypaik/prost"
     DATASET_NAME = None
@@ -50,6 +49,9 @@ class PROST(HFTask, MultipleChoiceTask):
     def has_test_docs(self):
         return True

+    def test_docs(self):
+        return map(self._process_doc, self.dataset["test"])
+
     def fewshot_context(self, doc, num_fewshot, provide_description=None, rnd=None, description=None):
         assert num_fewshot == 0, 'PROST is designed to probe models in a zero-shot fashion only.'
         return super().fewshot_context(
@@ -59,7 +61,7 @@ class PROST(HFTask, MultipleChoiceTask):
             description=description
         )

-    def _convert_standard(self, doc):
+    def _process_doc(self, doc):
         out_doc = {
             "query": f"{doc['context']}\nQuestion: {doc['ex_question']}\nAnswer:",
             "choices": [doc['A'], doc['B'], doc['C'], doc['D']],
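The fewshot_context override makes PROST zero-shot by construction: any request for shots trips the assertion before a prompt is built. A usage sketch, where task is a PROST instance and doc is any processed document:

task.fewshot_context(doc, num_fewshot=0)  # fine; defers to the base implementation
task.fewshot_context(doc, num_fewshot=5)
# AssertionError: PROST is designed to probe models in a zero-shot fashion only.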
[Ten more collapsed file diffs are not shown in this view.]