Unverified Commit 538be6da authored by Charles Foster, committed by GitHub

Merge pull request #7 from cfoster0/greedyuntil

Fork update and long-overdue SQuAD fixes
parents eb4c8407 5be42b4d
from lm_eval.base import MultipleChoiceTask
from best_download import download_file
from pathlib import Path
class LogiQA(MultipleChoiceTask):
DATASET_PATH = Path("data/logiqa")
def download(self):
if self.DATASET_PATH.exists():
return
Path.mkdir(self.DATASET_PATH)
base_url = "https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master"
splits = [
{"name": "Train", "checksum": "7d5bb1f58278e33b395744cd2ad8d7600faa0b3c4d615c659a44ec1181d759fa"},
{"name": "Eval", "checksum": "4c49e6753b7262c001506b9151135abf722247035ab075dad93acdea5789c01f"},
{"name": "Test", "checksum": "359acb78c37802208f7fde9e2f6574b8526527c63d6a336f90a53f1932cb4701"}
]
for split in splits:
file = self.DATASET_PATH / f"{split['name']}.txt"
download_file(f"{base_url}/{split['name']}.txt", str(file), split["checksum"])
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def _convert_standard(self, doc):
return {
"query": "Passage: " + doc["passage"] + "\nQuestion: " + doc["question"] + "\nAnswer:",
"choices": doc["options"],
"gold": ["a", "b", "c", "d"].index(doc["answerKey"])
}
def _load_docs(self, filename):
def normalize(text):
return text.replace(".", ". ").strip()
with open(filename, 'r') as f:
docs = f.read().strip().split("\n\n")
for rawdoc in docs:
rawdoc = rawdoc.split("\n")
doc = {
"answerKey": rawdoc[0].strip(),
"passage": normalize(rawdoc[1]),
"question": normalize(rawdoc[2]),
"options": [normalize(option[2:]) for option in rawdoc[3:]]
}
yield self._convert_standard(doc)
def training_docs(self):
return self._load_docs(self.DATASET_PATH / "Train.txt")
def validation_docs(self):
return self._load_docs(self.DATASET_PATH / "Eval.txt")
def test_docs(self):
return self._load_docs(self.DATASET_PATH / "Test.txt")
def fewshot_description(self):
# TODO: figure out actual description
return ""
def doc_to_text(self, doc):
return doc["query"]
from . common import HFTask
from lm_eval.base import mean, rf, MultipleChoiceTask
import re
class MathQA(HFTask, MultipleChoiceTask):
DATASET_PATH = "math_qa"
DATASET_NAME = None
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def _convert_standard(self, doc):
answer_idx = ['a', 'b', 'c', 'd', 'e'].index(doc['correct'])
choices = [c[4:].rstrip(" ,") for c in re.findall(r"[abcd] \) .*?, |e \) .*?$", doc['options'])]
out_doc = {
"query": "Question: " + doc['Problem'] +"\nAnswer:",
"choices": choices,
"gold": answer_idx,
}
return out_doc
def _load_docs(self, docs):
for record in docs:
yield self._convert_standard(record)
def training_docs(self):
docs = super().training_docs()
return self._load_docs(docs)
def validation_docs(self):
docs = super().validation_docs()
return self._load_docs(docs)
def test_docs(self):
docs = super().test_docs()
return self._load_docs(docs)
def fewshot_description(self):
# TODO: figure out description
return ""
def doc_to_text(self, doc):
return doc["query"]
import numpy as np
from lm_eval.base import rf, mean
from lm_eval.base import rf
from ..metrics import mean
from . common import HFTask
......
......@@ -2,7 +2,8 @@ import numpy as np
import json
import random
from .common import HFTask
from lm_eval.base import rf, mean
from lm_eval.base import rf
from ..metrics import mean
class Pubmed_QA(HFTask):
......
import os
import numpy as np
from best_download import download_file
from lm_eval.base import MultipleChoiceTask, rf
from lm_eval.metrics import mean
import xml.etree.ElementTree as ET
import random
class QA4MRE(MultipleChoiceTask):
YEAR = None
def download(self):
year = self.YEAR
lang = "EN"
base_path = (
"http://nlp.uned.es/clef-qa/repository/js/scripts/downloadFile.php?"
"file=/var/www/html/nlp/clef-qa/repository/resources/QA4MRE/"
)
# TODO: add side tasks?
variable_year_path = {
2011: '2011/Training_Data/Goldstandard/',
2012: '2012/Main_Task/Training_Data/Goldstandard/Used_in_Evaluation/',
2013: '2013/Main_Task/Training_Data/Goldstandard/'
}
sha256sums = {
2011 : "6d2524952a3a015f2a82df785b85b5578681e3602ec276b4e72c01f4ebc50034",
2012 : "f9edaf408f8ac93f89a643a0d0b19263a1bb5ce64f19b2af10df279a656dfb24",
2013 : "c60e5aa4ec77e0493ef0b11d46bd1d74d58a499a3a2f871b8cf3af9536f0f094",
}
vpath = variable_year_path[year]
url_path = f"{base_path}{vpath}QA4MRE-{year}-{lang}_GS.xml"
if not os.path.exists("data/qa4mre"):
os.mkdir("data/qa4mre")
if not os.path.isfile(f"data/qa4mre/QA4MRE-{year}-{lang}"):
download_file(
url_path,
f"data/qa4mre/QA4MRE-{year}-{lang}_GS.xml",
checksum=sha256sums[year],
)
def has_training_docs(self):
return False
def has_validation_docs(self):
return False
def has_test_docs(self):
return True
def fewshot_examples(self, k):
# Since this task only has test docs, sample few-shot examples from them
if self._training_docs is None:
self._training_docs = list(self.test_docs())
return random.sample(self._training_docs, k)
def _convert_standard(self, question):
choices = [i.text for i in question.iter('answer')]
out_doc = {
"query" : question.find('q_str').text,
"choices": choices,
"gold" : int(question.find("./answer[@correct='Yes']").attrib["a_id"]) - 1,
}
return out_doc
def load_docs(self, textfilename, tfds=False):
tree = ET.parse(textfilename)
root = tree.getroot()
# TODO: the source passage is sometimes much larger than the context window;
# at the moment it just gets left-truncated by the LM automatically, and maybe that's good enough?
for reading_test in root.iter('reading-test'):
src = reading_test[0].text
src = src.strip().replace("\'", "'")
for qid, question in enumerate(reading_test.iter('q')):
out_doc = self._convert_standard(question)
out_doc['source'] = src
yield out_doc
def fewshot_description(self):
return ""
def test_docs(self):
return self.load_docs(f"data/qa4mre/QA4MRE-{self.YEAR}-EN_GS.xml")
def doc_to_text(self, doc):
return "{}\nQuestion: {}\nAnswer:".format(doc["source"], doc["query"])
class QA4MRE_2011(QA4MRE):
YEAR = 2011
class QA4MRE_2012(QA4MRE):
YEAR = 2012
class QA4MRE_2013(QA4MRE):
YEAR = 2013
import collections
import datasets
import numpy as np
from lm_eval.base import rf, mean
from lm_eval.base import rf
from ..metrics import mean
from . common import HFTask
import os
......
import json
import random
import os
from lm_eval.base import MultipleChoiceTask, rf, mean
from lm_eval.base import MultipleChoiceTask, rf
from ..metrics import mean
from tqdm import auto as tqdm_lib
from . common import simple_accuracy_metric
import numpy as np
......
import os
import json
from ..utils import sh
from lm_eval.base import MultipleChoiceTask, rf, mean
from lm_eval.base import MultipleChoiceTask, rf
from ..metrics import mean
import zipfile
from best_download import download_file
class SciQ(MultipleChoiceTask):
......@@ -10,9 +12,11 @@ class SciQ(MultipleChoiceTask):
def download(self):
if not os.path.exists('data/sciq'):
os.mkdir('data/sciq')
sh((
"wget https://ai2-public-datasets.s3.amazonaws.com/sciq/SciQ.zip -O data/sciq/SciQ.zip"
))
download_file(
'https://ai2-public-datasets.s3.amazonaws.com/sciq/SciQ.zip',
'data/sciq/SciQ.zip',
'7f3312f6ac6b09970b32942d106a8c44ec0dad46a0369f17d635aff8e348a87c',
)
with zipfile.ZipFile("data/sciq/SciQ.zip", "r") as zf:
zf.extractall("data/sciq/")
......@@ -48,8 +52,6 @@ class SciQ(MultipleChoiceTask):
yield self._convert_standard(record)
def fewshot_description(self):
# Average ctx length in labelled dataset is 238.9
# 2 few-shot examples push it beyond the context window
return ""
def training_docs(self):
......
import datasets
from lm_eval.base import rf, f1_score, mean
from math import exp
from lm_eval.base import rf
from lm_eval.metrics import f1_score, mean
from . common import HFTask
class SQuAD(HFTask):
......@@ -26,7 +28,7 @@ class SQuAD(HFTask):
return ""
def doc_to_text(self, doc):
return 'Title: ' + doc['title'] + '\n\n' + 'Background: ' + doc['context'] + '\n\n' + 'Q: ' + doc['question'] + '\n\n' + 'A:'
return 'Title: ' + doc['title'] + '\n\n' + 'Background: ' + doc['context'] + '\n\n' + 'Question: ' + doc['question'] + '\n\n' + 'Answer:'
def doc_to_target(self, doc):
answer_list = doc['answers']['text']
......@@ -48,7 +50,8 @@ class SQuAD(HFTask):
part of the document for `doc`.
"""
continuation = rf.greedy_until(ctx, ['\n'])
return continuation
is_unanswerable = rf.loglikelihood(ctx, [' unanswerable'])
return continuation, is_unanswerable
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
......@@ -62,15 +65,22 @@ class SQuAD(HFTask):
"""
squad_metric = datasets.load_metric("squad_v2")
predictions = {
continuation, is_unanswerable = results
logprob_unanswerable, is_greedy = is_unanswerable
no_answer_probability = exp(logprob_unanswerable)
predictions = [{
'id': doc['id'],
'prediction_text': results[0],
}
'prediction_text': continuation,
'no_answer_probability': no_answer_probability,
}]
references = {
references = [{
'id': doc['id'],
'answers': doc['answers'],
}
}]
metrics = squad_metric.compute(predictions=predictions, references=references)
......
......@@ -5,7 +5,8 @@ To-do:
"""
import numpy as np
from . common import HFTask, yesno
from lm_eval.base import rf, mean, acc_all, metric_max_over_ground_truths
from lm_eval.base import rf
from ..metrics import mean, acc_all, metric_max_over_ground_truths
import sklearn
import transformers.data.metrics.squad_metrics as squad_metrics
from ..utils import general_detokenize
......@@ -218,7 +219,7 @@ class MultiRC(HFTask):
return f"{doc['paragraph']}\nQuestion: {doc['question']}\nAnswer:"
def doc_to_target(self, doc):
return self.format_answer(answer=doc["answer"], label=doc["label"])
return " " + self.format_answer(answer=doc["answer"], label=doc["label"])
@staticmethod
def format_answer(answer, label):
......@@ -271,30 +272,25 @@ class ReCoRD(HFTask):
def training_docs(self):
# In ReCoRD, each doc manifests multiple "examples" in the context of few shot example packing.
# Each doc consists of multiple answer candidates, each of which is scored yes/no.
# Hence, we one "doc" for each (context + passage, answer) pair.
# Moreover, we only use the correct answers for context packing
# (This is not an issue for evaluation, where we can directly score multiple candidates at once).
if self._training_docs is None:
self._training_docs = []
for doc in self.data["train"]:
for entity in list(set(doc["entities"])):
self._training_docs.append({
"passage": doc["passage"],
"query": doc["query"],
"entity": entity,
"label": entity in doc["answers"],
})
self._training_docs.append(self._process_doc(doc))
return self._training_docs
def validation_docs(self):
# See: training_docs
for doc in self.data["validation"]:
for entity in list(set(doc["entities"])):
yield {
"passage": doc["passage"],
"query": doc["query"],
"entity": entity,
"label": entity in doc["answers"],
}
yield self._process_doc(doc)
@classmethod
def _process_doc(cls, doc):
return {
"passage": doc["passage"],
"query": doc["query"],
"entities": sorted(list(set(doc["entities"]))),
"answers": sorted(list(set(doc["answers"]))),
}
def doc_to_text(self, doc):
initial_text, *highlights = doc["passage"].strip().split("\n@highlight\n")
......@@ -308,12 +304,13 @@ class ReCoRD(HFTask):
return f' - {query}'.replace("@placeholder", entity)
def doc_to_target(self, doc):
return self.format_answer(query=doc["query"], entity=doc["entity"])
# We only output the first correct entity in a doc
return self.format_answer(query=doc["query"], entity=doc["answers"][0])
def construct_requests(self, doc, ctx):
requests = [
rf.loglikelihood(ctx, self.format_answer(query=doc["query"], entity=entity))
for entity in doc["entity"]
for entity in doc["entities"]
]
return requests
......@@ -322,10 +319,10 @@ class ReCoRD(HFTask):
# - Pick the maximum likelihood prediction entity
# - Evaluate the accuracy and token F1 PER EXAMPLE
# - Average over all examples
max_idx = np.argmax(np.array(results))
max_idx = np.argmax(np.array([result[0] for result in results]))
prediction = doc["entities"][max_idx]
gold_label_set = list(set(doc["answers"]))
gold_label_set = doc["answers"]
f1 = metric_max_over_ground_truths(squad_metrics.compute_f1, prediction, gold_label_set)
em = metric_max_over_ground_truths(squad_metrics.compute_exact, prediction, gold_label_set)
......
import abc
import json
import random
import os
from collections import Iterable
from pprint import pprint
import pycountry
from sacrebleu import sacrebleu
import logging
from lm_eval import metrics
from lm_eval.base import Task, rf
"""
This file implements translation tasks using datasets from WMT conferences, provided by sacrebleu.
Traditionally they are evaluated with BLEU scores. TER and CHRF are other options.
See sacrebleu.DATASETS for all available datasets. There are a lot!
"""
sacrebleu_datasets = sacrebleu.DATASETS
def create_tasks_from_benchmarks(benchmark_dict):
"""Creates a dictionary of tasks from a dict
:param benchmark_dict: { dataset: [lang_pair, ...], }
:return: {task_name: task}
e.g. {wmt14-fr-en: Task, wmt16-de-en: Task}
"""
return {
f"{dataset}-{language_pair}": create_translation_task(dataset, language_pair)
for dataset, language_pairs in benchmark_dict.items()
for language_pair in language_pairs
}
########################################
# Tasks
########################################
def create_translation_task(dataset, language_pair):
class TranslationTask(GeneralTranslationTask):
def __init__(self):
super().__init__(dataset, language_pair)
return TranslationTask
class GeneralTranslationTask(Task):
# e.g. ("wmt14", "fr-en")
def __init__(self, sacrebleu_dataset, sacrebleu_language_pair=None):
self.sacrebleu_dataset = sacrebleu_dataset
self.sacrebleu_language_pair = sacrebleu_language_pair
self.src_file = self.ref_file = self.src_data = self.ref_data = None
super().__init__()
def download(self):
# This caches in the user's home dir automatically
self.src_file, self.ref_file = \
sacrebleu.download_test_set(self.sacrebleu_dataset, self.sacrebleu_language_pair)
self.src_data, self.ref_data = [
[line.rstrip() for line in sacrebleu.smart_open(file)]
for file in (self.src_file, self.ref_file)
]
def has_training_docs(self):
"""Whether the task has a training set"""
# TODO In the future we could be more discerning. Some more recent tests have train and dev sets
return False
def has_validation_docs(self):
"""Whether the task has a validation set"""
return False
def has_test_docs(self):
"""Whether the task has a test set"""
return True
def test_docs(self):
"""
:return: Iterable[obj]
An iterable of any object that doc_to_text can handle
"""
return [{
"src": src,
"ref": ref
} for src, ref in zip(self.src_data, self.ref_data)]
def doc_to_text(self, doc):
return doc["src"]
def doc_to_target(self, doc):
# This shows a single target, though there may be multiple targets in a lang test
return doc["ref"] if isinstance(doc["ref"], str) else doc["ref"][0]
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
return rf.greedy_until(ctx, ["\n"])
def process_results(self, doc, results):
# These metrics are corpus-level not sentence level, so we'll hide the
# results in this dict and compute the corpus score in the aggregate method
ref_pred = (doc["ref"], results)
return {
"bleu": ref_pred,
"chrf": ref_pred,
"ter": ref_pred,
}
def aggregation(self):
"""
:returns: {str: [float] -> float}
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
return {
"bleu": metrics.bleu,
"chrf": metrics.chrf,
"ter": metrics.ter,
}
def higher_is_better(self):
"""
:returns: {str: bool}
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
return {
"bleu": True,
"chrf": True,
"ter": False,
}
def fewshot_description(self):
language_codes = self.sacrebleu_language_pair.split("-")
src_lang = code_to_language(language_codes[0])
tar_lang = code_to_language(language_codes[1])
return f"Translate these {src_lang} phrases to {tar_lang}."
def __str__(self):
language_codes = self.sacrebleu_language_pair.split("-")
src_lang = code_to_language(language_codes[0])
tar_lang = code_to_language(language_codes[1])
return f"{self.sacrebleu_dataset.upper()} {src_lang} to {tar_lang} Task"
########################################
# Util
########################################
def code_to_language(code):
# key is alpha_2 or alpha_3 depending on the code length
language_tuple = pycountry.languages.get(**{f"alpha_{len(code)}": code})
return language_tuple.name
def print_available_tests():
pprint({ts: sacrebleu.get_langpairs_for_testset(ts) for ts in sacrebleu.get_available_testsets()})
def print_available_pairs():
list_of_pairs = [sacrebleu.get_langpairs_for_testset(ts) for ts in sacrebleu.get_available_testsets()]
pairs = set([item for sublist in list_of_pairs for item in sublist])
pairs = sorted(["-".join(map(code_to_language, pair.split("-"))) for pair in pairs])
pprint(pairs)
print(len(pairs))
def main():
# print(sacrebleu.download_test_set("wmt14", "en-fr"))
# print_available_tests()
# sacrebleu.print_test_set("wmt14", "fr-en", "src")
# # Print number of benchmarks
# print(sum([
# len(sacrebleu.get_langpairs_for_testset(ts))
# for ts in sacrebleu.get_available_testsets()
# ]))
# Test task dictionary
# for task, task_class in create_tasks_from_benchmarks(selected_benchmarks).items():
# print(task, task_class())
print_available_pairs()
pass
if __name__ == "__main__":
main()
########################################
# Don't mind me...!
########################################
# Available tests as of 2020/02/11
"""
{'iwslt17': ['en-fr',
'fr-en',
'en-de',
'de-en',
'en-zh',
'zh-en',
'en-ar',
'ar-en',
'en-ja',
'ja-en',
'en-ko',
'ko-en'],
'iwslt17/dev2010': ['en-fr', 'fr-en', 'en-de', 'de-en', 'en-zh', 'zh-en'],
'iwslt17/tst2010': ['en-fr', 'fr-en', 'en-de', 'de-en', 'en-zh', 'zh-en'],
'iwslt17/tst2011': ['en-fr', 'fr-en', 'en-de', 'de-en', 'en-zh', 'zh-en'],
'iwslt17/tst2012': ['en-fr', 'fr-en', 'en-de', 'de-en', 'en-zh', 'zh-en'],
'iwslt17/tst2013': ['en-fr', 'fr-en', 'en-de', 'de-en', 'en-zh', 'zh-en'],
'iwslt17/tst2014': ['en-fr', 'fr-en', 'en-de', 'de-en', 'en-zh', 'zh-en'],
'iwslt17/tst2015': ['en-fr', 'fr-en', 'en-de', 'de-en', 'en-zh', 'zh-en'],
'iwslt17/tst2016': ['en-fr', 'fr-en', 'en-de', 'de-en', 'en-zh', 'zh-en'],
'mtnt1.1/test': ['en-fr', 'fr-en', 'en-ja', 'ja-en'],
'mtnt1.1/train': ['en-fr', 'fr-en', 'en-ja', 'ja-en'],
'mtnt1.1/valid': ['en-fr', 'fr-en', 'en-ja', 'ja-en'],
'mtnt2019': ['en-fr', 'fr-en', 'en-ja', 'ja-en'],
'multi30k/2016': ['en-fr', 'en-de', 'en-cs'],
'multi30k/2017': ['en-fr', 'en-de'],
'multi30k/2018': ['en-fr', 'en-de'],
'wmt08': ['cs-en',
'en-cs',
'de-en',
'en-de',
'es-en',
'en-es',
'fr-en',
'en-fr',
'hu-en',
'en-hu'],
'wmt08/europarl': ['de-en', 'en-de', 'es-en', 'en-es', 'fr-en', 'en-fr'],
'wmt08/nc': ['cs-en', 'en-cs'],
'wmt09': ['cs-en',
'en-cs',
'de-en',
'en-de',
'es-en',
'en-es',
'fr-en',
'en-fr',
'hu-en',
'en-hu',
'it-en',
'en-it'],
'wmt10': ['cs-en',
'en-cs',
'de-en',
'en-de',
'es-en',
'en-es',
'fr-en',
'en-fr'],
'wmt11': ['cs-en',
'en-cs',
'de-en',
'en-de',
'fr-en',
'en-fr',
'es-en',
'en-es'],
'wmt12': ['cs-en',
'en-cs',
'de-en',
'en-de',
'es-en',
'en-es',
'fr-en',
'en-fr'],
'wmt13': ['cs-en',
'en-cs',
'de-en',
'en-de',
'es-en',
'en-es',
'fr-en',
'en-fr',
'ru-en',
'en-ru'],
'wmt14': ['cs-en',
'en-cs',
'de-en',
'en-de',
'en-fr',
'fr-en',
'en-hi',
'hi-en',
'en-ru',
'ru-en'],
'wmt14/full': ['cs-en',
'en-cs',
'de-en',
'en-de',
'en-fr',
'fr-en',
'en-hi',
'hi-en',
'en-ru',
'ru-en'],
'wmt15': ['en-fr',
'fr-en',
'cs-en',
'de-en',
'en-cs',
'en-de',
'en-fi',
'en-ru',
'fi-en',
'ru-en'],
'wmt16': ['cs-en',
'de-en',
'en-cs',
'en-de',
'en-fi',
'en-ro',
'en-ru',
'en-tr',
'fi-en',
'ro-en',
'ru-en',
'tr-en'],
'wmt16/B': ['en-fi'],
'wmt16/dev': ['en-ro', 'en-tr', 'ro-en', 'tr-en'],
'wmt16/tworefs': ['en-fi'],
'wmt17': ['cs-en',
'de-en',
'en-cs',
'en-de',
'en-fi',
'en-lv',
'en-ru',
'en-tr',
'en-zh',
'fi-en',
'lv-en',
'ru-en',
'tr-en',
'zh-en'],
'wmt17/B': ['en-fi'],
'wmt17/dev': ['en-lv', 'en-zh', 'lv-en', 'zh-en'],
'wmt17/improved': ['en-zh', 'zh-en'],
'wmt17/ms': ['zh-en'],
'wmt17/tworefs': ['en-fi'],
'wmt18': ['cs-en',
'de-en',
'en-cs',
'en-de',
'en-et',
'en-fi',
'en-ru',
'et-en',
'fi-en',
'ru-en',
'en-tr',
'tr-en',
'en-zh',
'zh-en'],
'wmt18/dev': ['et-en', 'en-et'],
'wmt18/test-ts': ['cs-en',
'de-en',
'en-cs',
'en-de',
'en-et',
'en-fi',
'en-ru',
'et-en',
'fi-en',
'ru-en',
'en-tr',
'tr-en',
'en-zh',
'zh-en'],
'wmt19': ['cs-de',
'de-cs',
'de-en',
'de-fr',
'en-cs',
'en-de',
'en-fi',
'en-gu',
'en-kk',
'en-lt',
'en-ru',
'en-zh',
'fi-en',
'fr-de',
'gu-en',
'kk-en',
'lt-en',
'ru-en',
'zh-en'],
'wmt19/dev': ['lt-en', 'en-lt', 'gu-en', 'en-gu', 'kk-en', 'en-kk'],
'wmt19/google/ar': ['en-de'],
'wmt19/google/arp': ['en-de'],
'wmt19/google/hqall': ['en-de'],
'wmt19/google/hqp': ['en-de'],
'wmt19/google/hqr': ['en-de'],
'wmt19/google/wmtp': ['en-de'],
'wmt20': ['cs-en',
'de-en',
'de-fr',
'en-cs',
'en-de',
'en-iu',
'en-ja',
'en-km',
'en-pl',
'en-ps',
'en-ru',
'en-ta',
'en-zh',
'fr-de',
'iu-en',
'ja-en',
'km-en',
'pl-en',
'ps-en',
'ru-en',
'ta-en',
'zh-en'],
'wmt20/dev': ['iu-en',
'en-iu',
'ja-en',
'en-ja',
'pl-en',
'en-pl',
'ta-en',
'en-ta'],
'wmt20/robust/set1': ['en-ja', 'en-de'],
'wmt20/robust/set2': ['en-ja', 'ja-en'],
'wmt20/robust/set3': ['de-en'],
'wmt20/tworefs': ['de-en', 'en-de', 'en-zh', 'ru-en', 'zh-en']}
"""
\ No newline at end of file
import os
import json
import random
from lm_eval.base import Task, mean, rf
from lm_eval.base import Task, rf
from ..metrics import mean
from ..utils import sh
class TriviaQA(Task):
......
import gzip
import json
import random
import shutil
from pathlib import Path
from best_download import download_file
from lm_eval.base import Task, rf
from lm_eval.metrics import mean
def extract_gzip(gz, to):
with gzip.open(gz, 'rb') as fin:
with open(to, 'wb') as fout:
shutil.copyfileobj(fin, fout)
class WordUnscrambleTask(Task):
BASE_PATH = Path("data/unscramble")
FILENAME = None
CHECKSUM = None # SHA256 Checksum.
def __init__(self):
super().__init__()
def download(self):
if not self.BASE_PATH.exists():
Path.mkdir(self.BASE_PATH)
file = self.BASE_PATH / self.FILENAME
if not file.exists():
rawfile = file.parent / (file.name + ".gz")
base_url = "https://raw.githubusercontent.com/openai/gpt-3/master/data"
download_file(f"{base_url}/{self.FILENAME}.gz", str(rawfile), self.CHECKSUM)
extract_gzip(gz=rawfile, to=file)
def has_training_docs(self):
return False
def has_validation_docs(self):
return True
def has_test_docs(self):
return False
def validation_docs(self):
file = self.BASE_PATH / self.FILENAME
return (json.loads(line) for line in open(file).read().splitlines())
def fewshot_description(self):
return "Please unscramble the letters into a word, and write that word:"
def doc_to_text(self, doc):
return doc["context"]
def doc_to_target(self, doc):
return doc["completion"]
def construct_requests(self, doc, ctx):
completion = rf.greedy_until(ctx, ["\n"])
return completion
def process_results(self, doc, results):
pred = results[0]
gold = doc["completion"]
return {
"acc": int(pred == gold)
}
def aggregation(self):
return {
"acc": mean
}
def higher_is_better(self):
return {
"acc": True
}
class Anagrams1(WordUnscrambleTask):
FILENAME = "mid_word_1_anagrams.jsonl"
CHECKSUM = "6768a86896083199de4815d4964cb2f6f1046476cfd80c2a562784f182905979"
class Anagrams2(WordUnscrambleTask):
FILENAME = "mid_word_2_anagrams.jsonl"
CHECKSUM = "c3d839d09a7954b78a27cd2cd75d4ed0488656c56ef4dbd741a005343826cb01"
class CycleLetters(WordUnscrambleTask):
FILENAME = "cycle_letters_in_word.jsonl"
CHECKSUM = "1689c9002bb8c5988bf5f05e977c9db92f57932c1b5a38998c29ac0dd71e1d42"
class RandomInsertion(WordUnscrambleTask):
FILENAME = "random_insertion_in_word.jsonl"
CHECKSUM = "72e65d83da53d15752ee0c47379509de149ddbad32d61184e5991df29616b78a"
class ReversedWords(WordUnscrambleTask):
FILENAME = "reversed_words.jsonl"
CHECKSUM = "133a08f875cd6c1ef8608a3233571a773881cc27b1c707de738cc6543439332a"
from . common import HFTask
from lm_eval.base import mean, rf
from lm_eval.base import rf
from ..metrics import mean
class WebQs(HFTask):
DATASET_PATH = "web_questions"
......
import numpy as np
from . common import HFTask
from lm_eval.base import rf, mean
from lm_eval.base import rf
from ..metrics import mean
"""
This evaluation of Winogrande uses partial evaluation as described by
......@@ -13,6 +14,8 @@ class Winogrande(HFTask):
DATASET_PATH = "winogrande"
DATASET_NAME = "winogrande_xl"
answer_to_num = {'1': 0, '2': 1}
def has_training_docs(self):
return True
......@@ -20,54 +23,59 @@ class Winogrande(HFTask):
return True
def has_test_docs(self):
return True
return False
def doc_to_text(self, doc):
return self.partial_context(doc, doc["option" + doc["answer"]])
def fewshot_description(self):
# TODO: redo description
return "Winograd schema sentence including a either a ___ blank with a missing word, making the pronoun ambiguous, or the same with the word filled in."
@classmethod
def partial_context(cls, doc):
# Substitute the pronoun in the sentence with each candidate choice
def partial_context(cls, doc, option):
# Substitute the pronoun in the sentence with the specified option
# and ignore everything after.
pronoun_loc = doc["sentence"].index("_")
context1 = doc["sentence"][:pronoun_loc] + doc["option1"]
context2 = doc["sentence"][:pronoun_loc] + doc["option2"]
return context1, context2
return doc["sentence"][:pronoun_loc] + option
def doc_to_target(self, doc):
return self.partial_target(doc)
@classmethod
def partial_target(cls, doc):
# The target is everything after the document specified pronoun.
pronoun_loc = doc["sentence"].index("_") + 1
return doc["sentence"][pronoun_loc:].strip()
def doc_to_text(self, doc):
context1, context2 = self.partial_context(doc)
return context1 + '\n' + context2 + '\n'
def doc_to_target(self, doc):
return self.partial_target(doc)
return " " + doc["sentence"][pronoun_loc:].strip()
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
"""Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
part of the document for `doc`.
"""
target = self.partial_target(doc)
context1, context2 = self.partial_context(doc)
ll_context1, _ = rf.loglikelihood(context1, " " + target)
ll_context2, _ = rf.loglikelihood(context2, " " + target)
return ll_context1, ll_context2
lls = []
for option in [doc["option1"], doc["option2"]]:
partial_ctx = self.partial_context(doc, option)
full_ctx = self.append_context(ctx, partial_ctx)
lls.append(rf.loglikelihood(full_ctx, target)[0])
return lls
@classmethod
def append_context(cls, ctx, partial_ctx):
ctx = ctx.split("\n\n") # Each fewshot context is on its own new line.
ctx.pop() # Remove the correct context put in by `doc_to_text`.
return "\n\n".join([*ctx, partial_ctx]) if ctx else partial_ctx
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
......@@ -75,15 +83,14 @@ class Winogrande(HFTask):
:param results:
The results of the requests created in construct_requests.
"""
answer = int(doc["answer"]) - 1 # `- 1` b/c doc["answer"] ∈ {'1', '2'}
return {
"acc": np.argmax(results) == answer
"acc": np.argmax(results) == self.answer_to_num[doc["answer"]]
}
def aggregation(self):
"""
:returns: {str: [float] -> float}
A dictionary where keys are the names of submetrics and values are
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
return {
......@@ -93,7 +100,7 @@ class Winogrande(HFTask):
def higher_is_better(self):
"""
:returns: {str: bool}
A dictionary where keys are the names of submetrics and values are
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
return {
......
import numpy as np
import random
from lm_eval.base import rf, mean
from lm_eval.base import rf
from ..metrics import mean
from . common import HFTask
"""
......@@ -26,14 +27,14 @@ class WinogradSchemaChallenge273(HFTask):
data = []
for doc in self.data["test"]:
doc["text"] = doc["text"].replace(" ", " ")
doc["options"][0] = self.__normalize_option(doc["options"][0], doc)
doc["options"][1] = self.__normalize_option(doc["options"][1], doc)
doc["options"][0] = self.__normalize_option(doc, doc["options"][0])
doc["options"][1] = self.__normalize_option(doc, doc["options"][1])
data.append(doc)
return {"test": data}
def __normalize_option(self, option, doc):
def __normalize_option(self, doc, option):
# Append `'s` to possessive determiner based options.
if doc["pronoun"].lower() in ["my", "his", "her", "our", "their"]:
if doc["pronoun"].lower() in ["my", "his", "her", "our", "their"]:
option += "'s"
# Appropriately lowercase the pronoun in the option.
pronoun = option.split()[0]
......@@ -51,56 +52,61 @@ class WinogradSchemaChallenge273(HFTask):
def has_test_docs(self):
return True
def fewshot_description(self):
# TODO: redo description
return "Winograd schema sentence with correct continuation. True. Winograd schema sentence with incorrect continuation. False."
def fewshot_examples(self, k):
# NOTE: `super().fewshot_examples` samples from training docs which are
# not available for this test-set-only dataset.
return random.sample(list(self.test_docs()), k)
def fewshot_description(self):
# TODO: redo description
return "Winograd schema sentence with correct continuation. True. Winograd schema sentence with incorrect continuation. False."
def doc_to_text(self, doc):
return self.partial_context(doc, doc["options"][doc["label"]])
@classmethod
def partial_context(cls, doc):
# Substitute the pronoun in the original text with each candidate
# choice and ignore everything after.
context1 = doc["text"][:doc["pronoun_loc"]] + doc["options"][0]
context2 = doc["text"][:doc["pronoun_loc"]] + doc["options"][1]
return context1, context2
def partial_context(cls, doc, option):
# Substitute the pronoun in the original text with the specified
# option and ignore everything after.
return doc["text"][:doc["pronoun_loc"]] + option
def doc_to_target(self, doc):
return self.partial_target(doc)
@classmethod
def partial_target(cls, doc):
# The target is everything after the document specified pronoun.
start_index = doc["pronoun_loc"] + len(doc["pronoun"])
return doc["text"][start_index:].strip()
def doc_to_text(self, doc):
context1, context2 = self.partial_context(doc)
return context1 + '\n' + context2 + '\n'
def doc_to_target(self, doc):
return self.partial_target(doc)
return " " + doc["text"][start_index:].strip()
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
"""Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
part of the document for `doc`.
"""
target = self.partial_target(doc)
context1, context2 = self.partial_context(doc)
ll_context1, _ = rf.loglikelihood(context1, " " + target)
ll_context2, _ = rf.loglikelihood(context2, " " + target)
return ll_context1, ll_context2
lls = []
for option in doc["options"]:
partial_ctx = self.partial_context(doc, option)
full_ctx = self.append_context(ctx, partial_ctx)
lls.append(rf.loglikelihood(full_ctx, target)[0])
return lls
@classmethod
def append_context(cls, ctx, partial_ctx):
ctx = ctx.split("\n\n") # Each fewshot context is on its own new line.
ctx.pop() # Remove the correct context put in by `doc_to_text`.
return "\n\n".join([*ctx, partial_ctx]) if ctx else partial_ctx
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
......@@ -115,7 +121,7 @@ class WinogradSchemaChallenge273(HFTask):
def aggregation(self):
"""
:returns: {str: [float] -> float}
A dictionary where keys are the names of submetrics and values are
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
return {
......@@ -125,7 +131,7 @@ class WinogradSchemaChallenge273(HFTask):
def higher_is_better(self):
"""
:returns: {str: bool}
A dictionary where keys are the names of submetrics and values are
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
return {
......
......@@ -20,7 +20,7 @@ def parse_args():
parser.add_argument('--seed', type=int, default=1234)
parser.add_argument('--output_path', default=None)
parser.add_argument('--limit', type=int, default=None)
parser.add_argument('--cache', action="store_true")
parser.add_argument('--no_cache', action="store_true")
return parser.parse_args()
def main():
......@@ -30,8 +30,11 @@ def main():
np.random.seed(args.seed)
lm = models.get_model(args.model).create_from_arg_string(args.model_args)
if args.limit:
print("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")
if args.cache:
if not args.no_cache:
lm = base.CachingLM(lm, 'lm_cache/' + args.model + '_' + args.model_args.replace('=', '-').replace(',', '_') + '.db')
if args.tasks == "all_tasks":
task_names = tasks.ALL_TASKS
......
......@@ -6,4 +6,7 @@ scikit-learn>=0.24.1
torch>=1.7
transformers>=4.1
sqlitedict==1.6.0
pytablewriter
\ No newline at end of file
pytablewriter==0.58.0
sacrebleu==1.5.0
pycountry==20.7.3
numexpr==2.7.2
\ No newline at end of file
janitor.py contains a script to remove benchmark data contamination from training data sets.
It uses the approach described in the [GPT-3 paper](https://arxiv.org/abs/2005.14165).
## Algorithm
1) Collects all contamination text files that are to be removed from training data
2) Filters training data by finding `N`gram matches between the training data
   and any contamination
   1) `N`grams ignore case and punctuation and are split on whitespace.
   2) Matching `N`gram substrings are removed, as is a `window_to_remove` character window around
      the match, splitting the training data into chunks
   3) Any chunks shorter than `minimum_slice_length` characters are removed
   4) Training data sets split into more than `too_dirty_cutoff` chunks are considered
      completely contaminated and removed
OpenAI used:
```
ngram_n = 13
window_to_remove = 200
minimum_slice_length = 200
too_dirty_cutoff = 10
```
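For orientation, here is a minimal pure-Python sketch of the filtering step described above. It is not the actual janitor.py implementation (which tracks n-gram positions while tokenizing and offloads the hot loop to C++); the names `make_ngrams` and `clean_document` are illustrative only.
```
# Hypothetical sketch of the decontamination filter; not the actual janitor.py API.
import re
import string

PUNCT = f"[{re.escape(string.punctuation)}]"

def make_ngrams(text, n=13):
    """Lowercase, strip punctuation, split on whitespace, and yield n-grams as strings."""
    words = re.sub(PUNCT, "", text.lower()).split()
    for i in range(len(words) - n + 1):
        yield " ".join(words[i:i + n])

def clean_document(doc, contamination_ngrams, n=13, window_to_remove=200,
                   minimum_slice_length=200, too_dirty_cutoff=10):
    """Return the clean chunks of `doc`, or [] if the document is too contaminated."""
    # Tokenize while remembering character offsets, normalizing tokens as in make_ngrams.
    tokens = [(m, re.sub(PUNCT, "", m.group().lower())) for m in re.finditer(r"\S+", doc)]
    tokens = [(m, norm) for m, norm in tokens if norm]  # drop punctuation-only tokens

    # Collect character spans of any n-gram that appears in the contamination set.
    dirty_spans = []
    for i in range(len(tokens) - n + 1):
        if " ".join(norm for _, norm in tokens[i:i + n]) in contamination_ngrams:
            dirty_spans.append((tokens[i][0].start(), tokens[i + n - 1][0].end()))

    # Remove each match plus a surrounding character window, splitting the doc into chunks.
    chunks, cursor = [], 0
    for start, end in dirty_spans:
        chunks.append(doc[cursor:max(cursor, start - window_to_remove)])
        cursor = max(cursor, end + window_to_remove)
    chunks.append(doc[cursor:])

    # Drop short chunks; discard the whole document if it was split into too many pieces.
    chunks = [c for c in chunks if len(c) >= minimum_slice_length]
    return [] if len(chunks) > too_dirty_cutoff else chunks
```
Contamination n-grams would be built once from the benchmark files, e.g. `contamination = set(make_ngrams(benchmark_text, 13))`, and `clean_document` then applied to every training document.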
## Compiling
Janitor can be used as a pure Python program, but it is much faster if the ngram
code is run in C++. To compile the C++ code, run
```
pip install pybind11
c++ -O3 -Wall -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes) janitor_util.cpp -o janitor_util$(python3-config --extension-suffix)
```
If your compiler isn't linked to Python, you may need to append `-undefined dynamic_lookup` to the command above.
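Once compiled, the extension imports like any other Python module. A minimal sanity check, assuming the build above produced a `janitor_util` shared object in the working directory (the two functions are the ones exported by `PYBIND11_MODULE` in `janitor_util.cpp` below):
```
import string
import janitor_util  # the compiled extension produced by the command above

text = "The quick brown fox, it jumped over the lazy dog."
# 3-grams of lowercased words, ignoring ASCII punctuation characters.
print(janitor_util.clean_ngram(text, string.punctuation, 3))
# The same n-grams, each paired with (start, end) character indices into `text`.
print(janitor_util.clean_ngram_with_indices(text, string.punctuation, 3))
```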
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <utility>
#include <string>
#include <vector>
#include <tuple>
#include <queue>
bool is_whitespace(char ch) noexcept {
// " \t\n\r\x0b\x0c" (python string.whitespace)
return ch == 32 or (9 <= ch and ch <= 13);
// return ch <= 32; // arguably too general, but slightly faster
}
bool is_punctuation(char c) noexcept {
// '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~' ascii values: 33-47, 58-64, 91-96, 123-126
return (33 <= c and c <= 47) or (58 <= c and c <= 64) or (91 <= c and c <= 96) or (123 <= c and c <= 126);
}
// Takes a string and makes ngrams of length N, splitting grams on whitespace and ignoring ignored characters
// Returns a LARGE array of ngrams
std::vector<std::string> clean_ngram(
std::string const & input, std::string const & ignore, size_t ngram_n
) noexcept {
size_t num_grams = 0;
std::vector<std::string> ngram_list;
std::vector<uint8_t> gram_lengths;
std::string current_ngram;
// Max gram length is set to 10 below.
current_ngram.reserve(11*ngram_n);
gram_lengths.reserve(ngram_n);
bool started_gram = false;
gram_lengths.push_back(0);
//for (size_t i=0; i<input.length(); i++) {
// this is slightly faster, and we don't need the index in this one
for (auto iter = input.begin(); iter != input.end(); iter++) {
// If whitespace, end the current ngram and start the next
// alternatively, (perhaps marginally) faster: if (is_whitespace(ch)) { ... }
if (is_whitespace(*iter) || gram_lengths.back() > 10) {
// Skip all whitespace
while (++iter != input.end() && is_whitespace(*iter));
iter--;
if (started_gram){
num_grams += 1;
// Building 1grams is a special case
if (ngram_n == 1){
ngram_list.push_back(current_ngram);
current_ngram = current_ngram.substr(gram_lengths.front());
gram_lengths.back() = 0;
// If there are enough grams to form an ngram, save
} else if (num_grams >= ngram_n){
// Save the current ngram
ngram_list.push_back(current_ngram);
// Start the next ngram by dropping the first gram and its space from the ngram
current_ngram = current_ngram.substr(gram_lengths.front() + 1);
current_ngram += ' ';
// Drop the length of the first gram and prepare to record the length of the new gram
gram_lengths.erase(gram_lengths.begin());
gram_lengths.push_back(0);
// Otherwise, continue building
} else {
current_ngram += ' ';
gram_lengths.push_back(0);
}
started_gram = false;
}
// Skip ignored characters
// alternatively, (perhaps marginally) faster: if (is_punctuation(ch)) continue;
} else if (ignore.find(*iter) != std::string::npos) {
continue;
}
// If it is a non-ignored character, add it to the ngram and update the last gram's length
else {
current_ngram += tolower(*iter);
gram_lengths.back() += 1;
started_gram = true;
}
}
return ngram_list;
}
// Takes a string and makes ngrams of length N, splitting grams on whitespace and ignoring ignored characters
// Returns a LARGE array of tuples of (ngram, start_idx, end_idx)
std::vector<std::tuple<std::string, size_t, size_t> > clean_ngram_with_indices(
std::string const & input, std::string const & ignore, size_t ngram_n
) noexcept {
size_t num_grams = 0;
std::vector<std::tuple<std::string, size_t, size_t> > ngram_list;
std::vector<uint8_t> gram_lengths;
std::vector<size_t> gram_start_indices;
std::string current_ngram;
// Max gram length is set to 10 below.
current_ngram.reserve(11*ngram_n);
bool started_gram = false;
gram_lengths.push_back(0);
gram_start_indices.push_back(0);
for (size_t i=0; i<input.length(); i++) {
char ch = input[i];
// If whitespace, end the current ngram and start the next
if (is_whitespace(ch) || gram_lengths.back() > 10) {
// Skip all whitespace
while (++i < input.length() && is_whitespace(input[i]));
i--;
if (started_gram){
num_grams += 1;
// Building 1grams is a special case
if (ngram_n == 1){
ngram_list.push_back(std::make_tuple(current_ngram, gram_start_indices.front(), i));
current_ngram = current_ngram.substr(gram_lengths.front());
gram_lengths.back() = 0;
gram_start_indices.back() = i+1;
// If there are enough grams to form an ngram, save
} else if (num_grams >= ngram_n){
// Save the current ngram
ngram_list.push_back(
std::make_tuple(current_ngram, gram_start_indices.front(), i)
);
// Start the next ngram by dropping the first gram and its space from the ngram
current_ngram = current_ngram.substr(gram_lengths.front() + 1);
current_ngram += ' ';
// Drop the length of the first gram and prepare to record the length of the new gram
gram_lengths.erase(gram_lengths.begin());
gram_lengths.push_back(0);
gram_start_indices.erase(gram_start_indices.begin());
gram_start_indices.push_back(i+1);
// Otherwise, continue building
} else {
current_ngram += ' ';
gram_lengths.push_back(0);
gram_start_indices.push_back(i+1);
}
started_gram = false;
}
// Skip ignored characters
} else if (ignore.find(ch) != std::string::npos) {
continue;
// If it is a non-ignored character, add it to the ngram and update the last gram's length
} else {
current_ngram += tolower(ch);
gram_lengths.back() += 1;
started_gram = true;
}
}
return ngram_list;
}
PYBIND11_MODULE(janitor_util, m) {
m.doc() = "pybind11 example plugin"; // optional module docstring
// m.def("add", &add, "A function which adds two numbers"); // example function
m.def("clean_ngram", &clean_ngram, "Create ngrams of words, ignoring some characters");
m.def("clean_ngram_with_indices", &clean_ngram_with_indices, "Create ngrams of words with indices, ignoring some characters");
}
// Example compile
// c++ -O3 -Wall -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes) janitor_util.cpp -o janitor_util$(python3-config --extension-suffix)
// If python and gcc aren't linked, append to the above: -undefined dynamic_lookup
\ No newline at end of file