Unverified commit 4887d9d3 authored by Jonathan Tow, committed by GitHub

Merge pull request #307 from Taekyoon/multilingual-ko-korsts

Add klue-sts task to evaluate Korean language tasks
parents 7064d6b9 4a1041c1
@@ -14,13 +14,6 @@ in the broader discourse.
Homepage: https://zenodo.org/record/2630551#.X4Xzn5NKjUI
"""
from . import lambada
from lm_eval.base import Task, rf
from lm_eval.metrics import mean, perplexity
from lm_eval.utils import sh
from best_download import download_file
import json
from functools import partial
import os
_CITATION = """
@@ -35,68 +28,37 @@ _CITATION = """
"""
LANGS = ["en", "fr", "de", "it", "es"]
CHECKSUMS = {"en": "4aa8d02cd17c719165fc8a7887fddd641f43fcafa4b1c806ca8abc31fabdb226",
"fr": "941ec6a73dba7dc91c860bf493eb66a527cd430148827a4753a4535a046bf362",
"de": "51c6c1795894c46e88e4c104b5667f488efe79081fb34d746b82b8caa663865e",
"it": "86654237716702ab74f42855ae5a78455c1b0e50054a4593fb9c6fcf7fad0850",
"es": "ffd760026c647fb43c67ce1bc56fd527937304b348712dce33190ea6caba6f9c"
}
class MultilingualLAMBADA(lambada.LAMBADA):
VERSION = 0
def __init__(self, lang=None):
self.LANG = lang
super().__init__()
def download(self):
sh("mkdir -p data/lambada")
f = f"data/lambada/lambada_test_{self.LANG}.jsonl"
url = f"http://eaidata.bmk.sh/data/lambada_test_{self.LANG}.jsonl"
try:
if not os.path.exists(f):
download_file(
url,
local_file=f,
expected_checksum=CHECKSUMS[self.LANG]
)
except:
# fallback - for some reason best_download doesn't always work here
sh(f"wget {url} -O {f}")
sh(f'echo "{CHECKSUMS[self.LANG]} {f}" | sha256sum --check')
def validation_docs(self):
with open(f"data/lambada/lambada_test_{self.LANG}.jsonl") as fh:
for line in fh:
yield json.loads(line)
class MultilingualLAMBADAEN(MultilingualLAMBADA):
def __init__(self):
super().__init__('en')
DATASET_NAME = 'en'
class MultilingualLAMBADAFR(MultilingualLAMBADA):
def __init__(self):
super().__init__('fr')
DATASET_NAME = 'fr'
class MultilingualLAMBADADE(MultilingualLAMBADA):
def __init__(self):
super().__init__('de')
DATASET_NAME = 'de'
class MultilingualLAMBADAIT(MultilingualLAMBADA):
def __init__(self):
super().__init__('it')
DATASET_NAME = 'it'
class MultilingualLAMBADAES(MultilingualLAMBADA):
def __init__(self):
super().__init__('es')
DATASET_NAME = 'es'
LANG_CLASSES = [MultilingualLAMBADAEN, MultilingualLAMBADAFR,
MultilingualLAMBADADE, MultilingualLAMBADAIT,
MultilingualLAMBADAES]
LANG_CLASSES = [MultilingualLAMBADAEN, MultilingualLAMBADAFR, MultilingualLAMBADADE, MultilingualLAMBADAIT, MultilingualLAMBADAES]
def construct_tasks():
tasks = {}
for lang, lang_class in zip(LANGS, LANG_CLASSES):
tasks[f"lambada_mt_{lang}"] = lang_class
for lang_class in LANG_CLASSES:
tasks[f"lambada_mt_{lang_class.DATASET_NAME}"] = lang_class
return tasks
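
For reference, a minimal sketch of how the registry produced by `construct_tasks()` might be consumed. The `lambada_mt_{lang}` keys come from the code above; the import path and the instantiation shown here are assumptions for illustration, not part of this diff.

```python
# Illustrative only: consuming the task registry built above.
# Assumed module path for this file within the harness.
from lm_eval.tasks.lambada_multilingual import construct_tasks

tasks = construct_tasks()
# Keys: lambada_mt_en, lambada_mt_fr, lambada_mt_de, lambada_mt_it, lambada_mt_es
task_cls = tasks["lambada_mt_de"]
task = task_cls()  # hypothetical instantiation of the German LAMBADA variant
```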
@@ -10,9 +10,9 @@ NLP setting.
Homepage: https://github.com/lgw863/LogiQA-dataset
"""
import inspect
import lm_eval.datasets.logiqa.logiqa
from lm_eval.base import MultipleChoiceTask
from best_download import download_file
from pathlib import Path
_CITATION = """
@@ -29,21 +29,8 @@ _CITATION = """
class LogiQA(MultipleChoiceTask):
VERSION = 0
DATASET_PATH = Path("data/logiqa")
def download(self):
if self.DATASET_PATH.exists():
return
Path.mkdir(self.DATASET_PATH, parents=True)
base_url = "https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master"
splits = [
{"name": "Train", "checksum": "7d5bb1f58278e33b395744cd2ad8d7600faa0b3c4d615c659a44ec1181d759fa"},
{"name": "Eval", "checksum": "4c49e6753b7262c001506b9151135abf722247035ab075dad93acdea5789c01f"},
{"name": "Test", "checksum": "359acb78c37802208f7fde9e2f6574b8526527c63d6a336f90a53f1932cb4701"}
]
for split in splits:
file = self.DATASET_PATH / f"{split['name']}.txt"
download_file(f"{base_url}/{split['name']}.txt", local_file=str(file), expected_checksum=split["checksum"])
DATASET_PATH = inspect.getfile(lm_eval.datasets.logiqa.logiqa)
DATASET_NAME = None
def has_training_docs(self):
return True
@@ -54,7 +41,18 @@ class LogiQA(MultipleChoiceTask):
def has_test_docs(self):
return True
def _convert_standard(self, doc):
def training_docs(self):
if self._training_docs is None:
self._training_docs = list(map(self._process_doc, self.dataset["train"]))
return self._training_docs
def validation_docs(self):
return map(self._process_doc, self.dataset["validation"])
def test_docs(self):
return map(self._process_doc, self.dataset["test"])
def _process_doc(self, doc):
def format_example(doc, choices):
"""
Passage: <passage>
@@ -66,7 +64,7 @@ class LogiQA(MultipleChoiceTask):
D. <choice4>
Answer:
"""
prompt = "Passage: " + doc["passage"] + "\n"
prompt = "Passage: " + doc["context"] + "\n"
prompt += "Question: " + doc["question"] + "\nChoices:\n"
for choice, option in zip(choices, doc["options"]):
prompt += f"{choice.upper()}. {option}\n"
@@ -76,33 +74,8 @@ class LogiQA(MultipleChoiceTask):
return {
"query": format_example(doc, choices),
"choices": doc["options"],
"gold": choices.index(doc["answerKey"])
"gold": choices.index(doc["label"])
}
def _load_docs(self, filename):
def normalize(text):
return text.replace(".", ". ").strip()
with open(filename, 'r') as f:
docs = f.read().strip().split("\n\n")
for rawdoc in docs:
rawdoc = rawdoc.split("\n")
doc = {
"answerKey": rawdoc[0].strip(),
"passage": normalize(rawdoc[1]),
"question": normalize(rawdoc[2]),
"options": [normalize(option[2:]) for option in rawdoc[3:]]
}
yield self._convert_standard(doc)
def training_docs(self):
return self._load_docs(self.DATASET_PATH / "Train.txt")
def validation_docs(self):
return self._load_docs(self.DATASET_PATH / "Eval.txt")
def test_docs(self):
return self._load_docs(self.DATASET_PATH / "Test.txt")
def doc_to_text(self, doc):
return doc["query"]
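
The removed manual download above is replaced by pointing `DATASET_PATH` at a packaged loading script. As a hedged sketch (how the base `Task` consumes this path is not shown in this diff), `inspect.getfile` simply resolves the module to its source file:

```python
import inspect

import lm_eval.datasets.logiqa.logiqa

# inspect.getfile returns the filesystem path of the module's source file,
# e.g. ".../lm_eval/datasets/logiqa/logiqa.py". Presumably the base Task
# forwards this path to datasets.load_dataset as a local loading script.
script_path = inspect.getfile(lm_eval.datasets.logiqa.logiqa)
print(script_path)
```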
@@ -10,7 +10,6 @@ Homepage: https://math-qa.github.io/math-QA/
"""
import re
from lm_eval.base import MultipleChoiceTask
from . common import HFTask
_CITATION = """
@@ -25,7 +24,7 @@ _CITATION = """
"""
class MathQA(HFTask, MultipleChoiceTask):
class MathQA(MultipleChoiceTask):
VERSION = 0
DATASET_PATH = "math_qa"
DATASET_NAME = None
@@ -39,13 +38,23 @@ class MathQA(HFTask, MultipleChoiceTask):
def has_test_docs(self):
return True
def _convert_standard(self, doc):
def training_docs(self):
if self._training_docs is None:
self._training_docs = list(map(self._process_doc, self.dataset["train"]))
return self._training_docs
def validation_docs(self):
return map(self._process_doc, self.dataset["validation"])
def test_docs(self):
return map(self._process_doc, self.dataset["test"])
def _process_doc(self, doc):
answer_idx = ['a', 'b', 'c', 'd', 'e'].index(doc['correct'])
choices = [c[4:].rstrip(" ,") for c in re.findall(r"[abcd] \) .*?, |e \) .*?$", doc['options'])]
out_doc = {
"query": "Question: " + doc['Problem'] +"\nAnswer:",
"query": "Question: " + doc['Problem'] + "\nAnswer:",
"choices": choices,
"gold": answer_idx,
}
......
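
A worked example of the `choices` extraction in `_process_doc` above; the options string below is invented but follows MathQA's flattened "a ) ... , b ) ..." format:

```python
import re

# Hypothetical MathQA options string in the dataset's flattened format.
options = "a ) 38 , b ) 27.675 , c ) 30 , d ) data inadequate , e ) none of these"

# Same pattern as in _process_doc: grab each "x ) ..." chunk, then drop the
# four-character "x ) " prefix and the trailing " ," separator.
choices = [c[4:].rstrip(" ,") for c in re.findall(r"[abcd] \) .*?, |e \) .*?$", options)]
print(choices)  # ['38', '27.675', '30', 'data inadequate', 'none of these']
```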
@@ -20,9 +20,8 @@ of a question's options. See section 4 of the paper for details.
Homepage: https://leaderboard.allenai.org/mctaco/submissions/public
"""
import numpy as np
from lm_eval.base import rf
from collections import defaultdict
from . common import HFTask
from lm_eval.base import rf, Task
_CITATION = """
@@ -35,7 +34,7 @@ _CITATION = """
"""
class MCTACO(HFTask):
class MCTACO(Task):
VERSION = 0
DATASET_PATH = "mc_taco"
DATASET_NAME = None
@@ -49,6 +48,12 @@ class MCTACO(HFTask):
def has_test_docs(self):
return True
def validation_docs(self):
return self.dataset["validation"]
def test_docs(self):
return self.dataset["test"]
def doc_to_text(self, doc):
return f"{doc['sentence']}\nQuestion: {doc['question']}\n"\
f"Answer: {doc['answer']}\nPlausible:"
......
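
For clarity, this is the prompt that `doc_to_text` above renders; the record is hypothetical, but the field names match those used in the method:

```python
# Hypothetical MC-TACO record with only the fields doc_to_text reads.
doc = {
    "sentence": "He ate a quick breakfast before leaving for work.",
    "question": "How long did it take him to eat breakfast?",
    "answer": "5 minutes",
}

prompt = (
    f"{doc['sentence']}\nQuestion: {doc['question']}\n"
    f"Answer: {doc['answer']}\nPlausible:"
)
print(prompt)
# He ate a quick breakfast before leaving for work.
# Question: How long did it take him to eat breakfast?
# Answer: 5 minutes
# Plausible:
```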
@@ -7,14 +7,11 @@ modified from Chinese high school English listening comprehension test data.
Homepage: https://github.com/Nealcly/MuTual
"""
import json
import zipfile
import shutil
import numpy as np
from pathlib import Path
import inspect
import lm_eval.datasets.mutual.mutual
from lm_eval.base import Task, rf
from lm_eval.metrics import mean
from best_download import download_file
_CITATION = """
@@ -30,29 +27,10 @@ _CITATION = """
class MuTualBase(Task):
VERSION = 1
BASE_PATH = Path("data/mutual")
DATASET_PATH = inspect.getfile(lm_eval.datasets.mutual.mutual)
DATASET_NAME = None
CHOICES = ['A', 'B', 'C', 'D']
def __init__(self):
super().__init__()
def download(self):
if self.BASE_PATH.exists():
return
Path.mkdir(self.BASE_PATH, parents=True)
master_zip = Path("data/master.zip")
download_file(
"https://github.com/Nealcly/MuTual/archive/master.zip",
local_file=str(master_zip),
expected_checksum="bb325cf6c672f0f02699993a37138b0fa0af6fcfc77ec81dfbe46add4d7b29f9")
with zipfile.ZipFile(master_zip, 'r') as zip:
zip.extractall("data")
Path("data/MuTual-master/data").rename(str(self.BASE_PATH))
# Remove left over files and directories.
master_zip.unlink()
shutil.rmtree("data/MuTual-master")
def has_training_docs(self):
return True
@@ -62,18 +40,11 @@ class MuTualBase(Task):
def has_test_docs(self):
return False
def _load_docs(self, path):
for file in sorted(path.iterdir()):
if file.suffix != ".txt":
continue
with open(file, 'r', encoding='utf-8') as f:
yield json.load(f)
def training_docs(self):
return self._load_docs(self.BASE_PATH / self.DATASET_NAME / "train")
return self.dataset["train"]
def validation_docs(self):
return self._load_docs(self.BASE_PATH / self.DATASET_NAME / "dev")
return self.dataset["validation"]
def test_docs(self):
return NotImplemented
@@ -134,8 +105,8 @@ class MuTualBase(Task):
class MuTual(MuTualBase):
DATASET_NAME = Path("mutual")
DATASET_NAME = "mutual"
class MuTualPlus(MuTualBase):
DATASET_NAME = Path("mutual_plus")
DATASET_NAME = "mutual_plus"
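
The recurring pattern in this PR is that `DATASET_PATH` names either a Hugging Face Hub dataset or a packaged loading script, and `DATASET_NAME` selects a configuration. The base `Task` loading code is not part of this diff, so the following is only a hedged sketch of how those attributes are typically consumed:

```python
import inspect

import datasets

import lm_eval.datasets.mutual.mutual

# Hedged sketch (the base Task's actual loading code is not shown here):
# DATASET_PATH may be a local loading script, DATASET_NAME a configuration.
DATASET_PATH = inspect.getfile(lm_eval.datasets.mutual.mutual)
DATASET_NAME = "mutual_plus"

dataset = datasets.load_dataset(path=DATASET_PATH, name=DATASET_NAME)
print(dataset["train"][0])  # splits are then exposed as dataset["train"], etc.
```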
@@ -15,8 +15,7 @@ not even bother with the train set.
Homepage: https://ai.google.com/research/NaturalQuestions
"""
import random
from . common import HFTask
from lm_eval.base import Task
from itertools import islice
@@ -30,7 +29,7 @@ _CITATION = """
"""
class NaturalQs(HFTask):
class NaturalQs(Task):
VERSION = 0
DATASET_PATH = "natural_questions"
DATASET_NAME = None
@@ -47,7 +46,12 @@ class NaturalQs(HFTask):
def training_docs(self):
# Cache training for faster few-shot.
# Data is too large to fit in memory.
return self.data["train"]
if self._training_docs is None:
self._training_docs = list(self.dataset["train"])
return self._training_docs
def validation_docs(self):
return self.dataset["validation"]
def fewshot_examples(self, k, rnd):
# Data is too large to fit in memory. We just sample from the first bit.
......
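
The `fewshot_examples` override is truncated above; the `islice` import and the comment suggest sampling few-shot examples only from the head of the very large train split. A hedged, self-contained sketch of that idea, not the exact code from this PR:

```python
import random
from itertools import islice

def sample_fewshot_from_head(train_iterable, k, rnd, head_size=1000):
    # Hypothetical helper: materialize only the first `head_size` training
    # examples and draw k of them, instead of loading the whole train split.
    head = list(islice(train_iterable, head_size))
    return rnd.sample(head, k)

# Example with a stand-in iterable and a seeded RNG.
examples = sample_fewshot_from_head(iter(range(5000)), k=3, rnd=random.Random(1234))
print(examples)
```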
@@ -15,7 +15,6 @@ based algorithm and a word co-occurrence algorithm.
Homepage: https://allenai.org/data/open-book-qa
"""
from lm_eval.base import MultipleChoiceTask
from .common import HFTask
_CITATION = """
@@ -28,7 +27,7 @@ _CITATION = """
"""
class OpenBookQA(HFTask, MultipleChoiceTask):
class OpenBookQA(MultipleChoiceTask):
VERSION = 0
DATASET_PATH = "openbookqa"
DATASET_NAME = "main"
@@ -42,7 +41,18 @@ class OpenBookQA(HFTask, MultipleChoiceTask):
def has_test_docs(self):
return True
def _convert_standard(self, doc):
def training_docs(self):
if self._training_docs is None:
self._training_docs = list(map(self._process_doc, self.dataset["train"]))
return self._training_docs
def validation_docs(self):
return map(self._process_doc, self.dataset["validation"])
def test_docs(self):
return map(self._process_doc, self.dataset["test"])
def _process_doc(self, doc):
out_doc = {
"id": doc["id"],
"query": doc["question_stem"],
......
This diff is collapsed.
@@ -9,10 +9,7 @@ actually learning about the world?
Homepage: https://yonatanbisk.com/piqa/
"""
import numpy as np
from lm_eval.base import MultipleChoiceTask, rf
from ..metrics import mean
from . common import HFTask
from lm_eval.base import MultipleChoiceTask
_CITATION = """
@@ -29,7 +26,7 @@ _CITATION = """
"""
class PiQA(HFTask, MultipleChoiceTask):
class PiQA(MultipleChoiceTask):
VERSION = 0
DATASET_PATH = "piqa"
DATASET_NAME = None
@@ -43,7 +40,15 @@ class PiQA(HFTask, MultipleChoiceTask):
def has_test_docs(self):
return False
def _convert_standard(self, doc):
def training_docs(self):
if self._training_docs is None:
self._training_docs = list(map(self._process_doc, self.dataset["train"]))
return self._training_docs
def validation_docs(self):
return map(self._process_doc, self.dataset["validation"])
def _process_doc(self, doc):
out_doc = {
"goal": doc["goal"],
"choices": [doc["sol1"], doc["sol2"]],
......
@@ -15,7 +15,6 @@ have been trained on data not specifically collected to succeed on PROST."
Homepage: https://github.com/nala-cub/prost
"""
from lm_eval.base import MultipleChoiceTask
from . common import HFTask
_CITATION = """
@@ -36,7 +35,7 @@ _CITATION = """
"""
class PROST(HFTask, MultipleChoiceTask):
class PROST(MultipleChoiceTask):
VERSION = 0
DATASET_PATH = "corypaik/prost"
DATASET_NAME = None
@@ -50,6 +49,9 @@ class PROST(HFTask, MultipleChoiceTask):
def has_test_docs(self):
return True
def test_docs(self):
return map(self._process_doc, self.dataset["test"])
def fewshot_context(self, doc, num_fewshot, provide_description=None, rnd=None, description=None):
assert num_fewshot == 0, 'PROST is designed to probe models in a zero-shot fashion only.'
return super().fewshot_context(
@@ -59,7 +61,7 @@ class PROST(HFTask, MultipleChoiceTask):
description=description
)
def _convert_standard(self, doc):
def _process_doc(self, doc):
out_doc = {
"query": f"{doc['context']}\nQuestion: {doc['ex_question']}\nAnswer:",
"choices": [doc['A'], doc['B'], doc['C'], doc['D']],
......
This diff is collapsed.