Commit d986fd3c authored by Jon Tow's avatar Jon Tow
Browse files

Add proper normalization, checksum, and formatting

parents 805b541c 879aabd6
...@@ -21,7 +21,8 @@ class DummyLM(LM): ...@@ -21,7 +21,8 @@ class DummyLM(LM):
def greedy_until(self, requests): def greedy_until(self, requests):
res = [] res = []
for _ in requests: for ctx, _ in requests:
res.append("lol") res.append("lol")
assert ctx.strip() != ''
return res return res
...@@ -30,6 +30,7 @@ from . import translation ...@@ -30,6 +30,7 @@ from . import translation
from . import headqa from . import headqa
from . import mathqa from . import mathqa
from . import drop from . import drop
from . import unscramble
######################################## ########################################
# Translation tasks # Translation tasks
...@@ -132,6 +133,13 @@ TASK_REGISTRY = { ...@@ -132,6 +133,13 @@ TASK_REGISTRY = {
**translation.create_tasks_from_benchmarks(gpt3_translation_benchmarks), **translation.create_tasks_from_benchmarks(gpt3_translation_benchmarks),
# chef's selection, mostly wmt20 # chef's selection, mostly wmt20
**translation.create_tasks_from_benchmarks(selected_translation_benchmarks), **translation.create_tasks_from_benchmarks(selected_translation_benchmarks),
# Word Scrambling and Manipulation Tasks
"anagrams1": unscramble.Anagrams1,
"anagrams2": unscramble.Anagrams2,
"cycle_letters": unscramble.CycleLetters,
"random_insertion": unscramble.RandomInsertion,
"reversed_words": unscramble.ReversedWords,
} }
......
import json import json
import numpy as np import numpy as np
import re import re
import transformers.data.metrics.squad_metrics as squad_metrics import string
from best_download import download_file from best_download import download_file
from scipy.optimize import linear_sum_assignment from scipy.optimize import linear_sum_assignment
from lm_eval.base import Task, rf from lm_eval.base import Task, rf
...@@ -16,15 +16,18 @@ https://github.com/allenai/allennlp-reading-comprehension/blob/master/allennlp_r ...@@ -16,15 +16,18 @@ https://github.com/allenai/allennlp-reading-comprehension/blob/master/allennlp_r
class DROP(Task): class DROP(Task):
DATAFOLDER = Path("data/drop") DATASET_PATH = Path("data/drop")
URL = "https://s3-us-west-2.amazonaws.com/allennlp/datasets/drop/drop_dataset.zip"
def download(self): def download(self):
if self.DATAFOLDER.exists(): return if self.DATASET_PATH.exists():
Path.mkdir(self.DATAFOLDER) return
download_file(self.URL, to=str(self.DATAFOLDER / "drop_dataset.zip")) Path.mkdir(self.DATASET_PATH)
with ZipFile(self.DATAFOLDER / "drop_dataset.zip", "r") as zip: url = "https://s3-us-west-2.amazonaws.com/allennlp/datasets/drop/drop_dataset.zip"
zip.extractall(self.DATAFOLDER) checksum = "39d2278a29fd729de301b111a45f434c24834f40df8f4ff116d864589e3249d6"
zip_path = self.DATASET_PATH / "drop_dataset.zip"
download_file(url, str(zip_path), checksum)
with ZipFile(zip_path, "r") as zip:
zip.extractall(self.DATASET_PATH)
def has_training_docs(self): def has_training_docs(self):
return True return True
...@@ -61,16 +64,13 @@ class DROP(Task): ...@@ -61,16 +64,13 @@ class DROP(Task):
answers["date"]["year"]]).strip()] answers["date"]["year"]]).strip()]
def training_docs(self): def training_docs(self):
docs = json.load(open(self.DATAFOLDER / "drop_dataset" / "drop_dataset_train.json")) docs = json.load(open(self.DATASET_PATH / "drop_dataset" / "drop_dataset_train.json"))
return self._load_docs([docs[k] for k in docs.keys()]) return self._load_docs([docs[k] for k in docs.keys()])
def validation_docs(self): def validation_docs(self):
docs = json.load(open(self.DATAFOLDER / "drop_dataset" / "drop_dataset_dev.json")) docs = json.load(open(self.DATASET_PATH / "drop_dataset" / "drop_dataset_dev.json"))
return self._load_docs([docs[k] for k in docs.keys()]) return self._load_docs([docs[k] for k in docs.keys()])
def test_docs(self):
pass
def doc_to_text(self, doc): def doc_to_text(self, doc):
return f"Passage: {doc['passage']}\nQuestion: {doc['question']}\nAnswer:" return f"Passage: {doc['passage']}\nQuestion: {doc['question']}\nAnswer:"
...@@ -103,44 +103,55 @@ class DROP(Task): ...@@ -103,44 +103,55 @@ class DROP(Task):
:param results: :param results:
The results of the requests created in construct_requests. The results of the requests created in construct_requests.
""" """
golds, preds = doc["answers"], results preds, golds = results, doc["answers"]
exact_match = self._exact_match(golds, preds) exact_match, f1_score = self.get_metrics(preds, golds)
f1_score = self._f1_score(golds, preds)
return { return {
"em": exact_match, "em": exact_match,
"f1": f1_score "f1": f1_score
} }
def _exact_match(self, golds, preds): def get_metrics(self, preds, golds):
""" Returns the exact match of normalized gold answers and predictions. """ exact_match = self._exact_match(preds, golds)
normalized_golds = set([self._normalize(gold) for gold in golds]) f1_score = self._f1_score(preds, golds)
normalized_preds = set([self._normalize(pred) for pred in preds]) return exact_match, f1_score
return int(normalized_golds == normalized_preds)
def _f1_score(self, golds, preds): def _exact_match(self, preds, golds):
"""Returns the average F1-score over normalized gold answers and predictions. """ """ Returns the exact match of normalized gold answers and predictions. """
gold_bags = self._answer_to_bags(golds) normalized_preds = [self._normalize(pred) for pred in preds]
normalized_golds = [self._normalize(gold) for gold in golds]
is_equal_sets = set(normalized_preds) == set(normalized_golds)
is_equal_length = len(normalized_preds) == len(normalized_golds)
return int(is_equal_sets and is_equal_length)
def _f1_score(self, preds, golds):
"""Returns the average F1-score over normalized gold answers and predictions.
From Section 5 of Dua et al. "DROP:...":
"When an answer has multiple spans, we first perform a one-to-one
alignment greedily based on bag-of-word overlap on the set of spans
and then compute average F1 over each span."
"""
pred_bags = self._answer_to_bags(preds) pred_bags = self._answer_to_bags(preds)
f1_per_bag = self._align_bags(gold_bags, pred_bags) gold_bags = self._answer_to_bags(golds)
f1_per_bag = self._align_bags(pred_bags, gold_bags)
return np.mean(f1_per_bag) return np.mean(f1_per_bag)
def _answer_to_bags(self, answers): def _answer_to_bags(self, answers):
return [set(self._normalize(answer).split()) for answer in answers] return [set(self._normalize(answer).split()) for answer in answers]
def _align_bags(self, gold_bags, pred_bags): def _align_bags(self, pred_bags, gold_bags):
""" Returns the max metric value over all the answers. """ """ Returns the max metric value over all the answers. """
scores = np.zeros([len(gold_bags), len(pred_bags)]) scores = np.zeros([len(gold_bags), len(pred_bags)])
for gold_index, gold_bag in enumerate(gold_bags): for gold_index, gold_bag in enumerate(gold_bags):
for pred_index, pred_bag in enumerate(pred_bags): for pred_index, pred_bag in enumerate(pred_bags):
if self._is_number_match(gold_bag, pred_bag): if self._is_number_match(pred_bag, gold_bag):
scores[gold_index, pred_index] = self._bag_f1(gold_bag, pred_bag) scores[gold_index, pred_index] = self._bag_f1(pred_bag, gold_bag)
row_ind, col_ind = linear_sum_assignment(-scores) row_ind, col_ind = linear_sum_assignment(-scores)
max_scores = np.zeros([max(len(gold_bags), len(pred_bags))]) max_scores = np.zeros([max(len(gold_bags), len(pred_bags))])
for row, column in zip(row_ind, col_ind): for row, column in zip(row_ind, col_ind):
max_scores[row] = max(max_scores[row], scores[row, column]) max_scores[row] = max(max_scores[row], scores[row, column])
return max_scores return max_scores
def _bag_f1(self, gold_bag, pred_bag): def _bag_f1(self, pred_bag, gold_bag):
intersection = len(gold_bag.intersection(pred_bag)) intersection = len(gold_bag.intersection(pred_bag))
if intersection == 0: if intersection == 0:
return 0.0 return 0.0
...@@ -149,15 +160,45 @@ class DROP(Task): ...@@ -149,15 +160,45 @@ class DROP(Task):
f1 = (2 * precision * recall) / (precision + recall) f1 = (2 * precision * recall) / (precision + recall)
return f1 return f1
def _is_number_match(self, gold_bag, pred_bag): def _is_number_match(self, pred_bag, gold_bag):
gold_numbers = set(filter(lambda s: s.isnumeric(), list(gold_bag))) pred_numbers = set([word for word in pred_bag if self._is_number(word)])
pred_numbers = set(filter(lambda s: s.isnumeric(), list(pred_bag))) gold_numbers = set([word for word in gold_bag if self._is_number(word)])
return (not gold_numbers) or gold_numbers.intersection(pred_numbers) if (not gold_numbers) or gold_numbers.intersection(pred_numbers):
return True
return False
def _is_number(self, text):
try:
float(text)
return True
except ValueError:
return False
def _normalize(self, answer): def _normalize(self, answer):
def remove_articles(text):
regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
return re.sub(regex, " ", text)
def white_space_fix(text):
return " ".join(text.split())
def remove_punc(text):
exclude = set(string.punctuation)
if not self._is_number(text):
return "".join(ch for ch in text if ch not in exclude)
else:
return text
def fix_number(text):
return str(float(text)) if self._is_number(text) else text
def tokenize(text): def tokenize(text):
return re.split(" |-", text) return re.split(" |-", text)
tokens = [squad_metrics.normalize_answer(token) for token in tokenize(answer)]
tokens = [
white_space_fix(remove_articles(fix_number(remove_punc(token.lower()))))
for token in tokenize(answer)
]
tokens = [token for token in tokens if token.strip()] tokens = [token for token in tokens if token.strip()]
normalized = " ".join(tokens).strip() normalized = " ".join(tokens).strip()
return normalized return normalized
......
...@@ -145,12 +145,6 @@ class GeneralTranslationTask(Task): ...@@ -145,12 +145,6 @@ class GeneralTranslationTask(Task):
tar_lang = code_to_language(language_codes[1]) tar_lang = code_to_language(language_codes[1])
return f"Translate these {src_lang} phrases to {tar_lang}." return f"Translate these {src_lang} phrases to {tar_lang}."
# TODO This should be something like
# French: {src_line}
# English: {ref_line}
def fewshot_context(self, doc, num_fewshot, provide_description):
return ""
def __str__(self): def __str__(self):
language_codes = self.sacrebleu_language_pair.split("-") language_codes = self.sacrebleu_language_pair.split("-")
src_lang = code_to_language(language_codes[0]) src_lang = code_to_language(language_codes[0])
......
import gzip
import json
import random
import shutil
from pathlib import Path
from best_download import download_file
from lm_eval.base import Task, rf
from lm_eval.metrics import mean
def extract_gzip(gz, to):
    """Decompress the gzip archive at `gz`, writing the raw bytes to `to`."""
    with gzip.open(gz, 'rb') as source, open(to, 'wb') as target:
        shutil.copyfileobj(source, target)
class WordUnscrambleTask(Task):
    """Base class for the GPT-3 word unscrambling / manipulation tasks.

    Each subclass names one of the .jsonl data files published in the
    openai/gpt-3 repository via FILENAME and CHECKSUM.  Documents carry a
    scrambled "context" and the expected "completion"; the model is scored
    by exact-match accuracy on a greedy, newline-terminated completion.
    """
    BASE_PATH = Path("data/unscramble")
    FILENAME = None  # Name of the extracted .jsonl data file (set by subclass).
    CHECKSUM = None  # SHA256 checksum of the downloaded .gz archive (set by subclass).

    def download(self):
        """Download and extract this task's data file if not already present."""
        # parents=True also creates the top-level "data" directory when it is
        # missing (a bare Path.mkdir would raise FileNotFoundError there);
        # exist_ok=True makes repeated calls safe.
        self.BASE_PATH.mkdir(parents=True, exist_ok=True)
        file = self.BASE_PATH / self.FILENAME
        if not file.exists():
            rawfile = file.parent / (file.name + ".gz")
            base_url = "https://raw.githubusercontent.com/openai/gpt-3/master/data"
            download_file(f"{base_url}/{self.FILENAME}.gz", str(rawfile), self.CHECKSUM)
            extract_gzip(gz=rawfile, to=file)

    def has_training_docs(self):
        return False

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def validation_docs(self):
        """Yield one parsed JSON document per line of the data file."""
        file = self.BASE_PATH / self.FILENAME
        # Read under a context manager so the handle is closed (the previous
        # version left open(file) unclosed).  The generator's source iterable
        # is evaluated eagerly, so the read happens before the file closes.
        with open(file) as f:
            return (json.loads(line) for line in f.read().splitlines())

    def fewshot_description(self):
        return "Please unscramble the letters into a word, and write that word:"

    def doc_to_text(self, doc):
        return doc["context"]

    def doc_to_target(self, doc):
        return doc["completion"]

    def construct_requests(self, doc, ctx):
        # Generate greedily until a newline: the answer is a single line.
        completion = rf.greedy_until(ctx, ["\n"])
        return completion

    def process_results(self, doc, results):
        """Score the single greedy completion by exact match against gold."""
        pred = results[0]
        gold = doc["completion"]
        return {
            "acc": int(pred == gold)
        }

    def aggregation(self):
        return {
            "acc": mean
        }

    def higher_is_better(self):
        return {
            "acc": True
        }
class Anagrams1(WordUnscrambleTask):
    """Anagrams variant 1 of the GPT-3 unscramble suite (mid_word_1_anagrams)."""
    FILENAME = "mid_word_1_anagrams.jsonl"
    CHECKSUM = "6768a86896083199de4815d4964cb2f6f1046476cfd80c2a562784f182905979"
class Anagrams2(WordUnscrambleTask):
    """Anagrams variant 2 of the GPT-3 unscramble suite (mid_word_2_anagrams)."""
    FILENAME = "mid_word_2_anagrams.jsonl"
    CHECKSUM = "c3d839d09a7954b78a27cd2cd75d4ed0488656c56ef4dbd741a005343826cb01"
class CycleLetters(WordUnscrambleTask):
    """Cycled-letters task of the GPT-3 unscramble suite (cycle_letters_in_word)."""
    FILENAME = "cycle_letters_in_word.jsonl"
    CHECKSUM = "1689c9002bb8c5988bf5f05e977c9db92f57932c1b5a38998c29ac0dd71e1d42"
class RandomInsertion(WordUnscrambleTask):
    """Random-insertion task of the GPT-3 unscramble suite (random_insertion_in_word)."""
    FILENAME = "random_insertion_in_word.jsonl"
    CHECKSUM = "72e65d83da53d15752ee0c47379509de149ddbad32d61184e5991df29616b78a"
class ReversedWords(WordUnscrambleTask):
    """Reversed-words task of the GPT-3 unscramble suite (reversed_words)."""
    FILENAME = "reversed_words.jsonl"
    CHECKSUM = "133a08f875cd6c1ef8608a3233571a773881cc27b1c707de738cc6543439332a"
import argparse
import json
import numpy as np
import random
import itertools
import collections
import logging
from lm_eval import models, tasks, evaluator, base
# Silence the openai client's info-level request logging.
logging.getLogger("openai").setLevel(logging.WARNING)

# Candidate few-shot description strings to compare against each other.
# Replace the placeholders with the real descriptions to evaluate.
fewshot_descriptions = [
    "foo",
    "bar"
]

# Hard-coded evaluation settings (this script takes no CLI arguments;
# edit these in place).
task = "lambada"
num_fewshot = 0
model = "gpt2"
model_args = ""
limit = None      # cap on docs evaluated per task; None = full set
no_cache = False  # when True, skip wrapping the model in CachingLM
class CustomDescTask:
    """Proxy around a task object that substitutes a custom few-shot description.

    Every attribute access other than `fewshot_description` (and the proxy's
    own `task`/`desc`) is delegated to the wrapped task via `__getattr__`.
    """

    def __init__(self, task, desc):
        self.task = task
        self.desc = desc

        def fewshot_description():
            return self.desc

        # Shadow the method on the proxy itself rather than monkey-patching
        # self.task (as before): the wrapped task object is left unmodified,
        # so it can be shared/reused safely across proxies.
        self.fewshot_description = fewshot_description

    def __getattr__(self, attr):
        # Only invoked for attributes not found on the proxy instance itself.
        return getattr(self.task, attr)
def main():
    """Evaluate one task under each candidate few-shot description and print
    the JSON results plus a Markdown summary table for each description.

    NOTE(review): leading indentation was lost in this copy; the block
    structure below (one table printed per description, inside the loop) is
    reconstructed and should be confirmed against the original file.
    """
    # Fix seeds so document sampling / ordering is reproducible across runs.
    random.seed(42)
    np.random.seed(42)

    lm = models.get_model(model).create_from_arg_string(model_args)

    if limit:
        print("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")

    if not no_cache:
        # Cache LM calls on disk, keyed by model name and its argument string.
        lm = base.CachingLM(lm, 'lm_cache/' + model + '_' + model_args.replace('=', '-').replace(',', '_') + '.db')

    task_dict = tasks.get_task_dict([task])

    for desc in fewshot_descriptions:
        # Wrap every task so it reports `desc` as its few-shot description.
        custom_task_dict = {k: CustomDescTask(v, desc) for k, v in task_dict.items()}
        results = evaluator.evaluate(lm, custom_task_dict, True, num_fewshot, limit)
        dumped = json.dumps(results, indent=2)
        print('Description:', desc)
        print(dumped)

        # MAKE TABLE
        from pytablewriter import MarkdownTableWriter
        writer = MarkdownTableWriter()
        writer.headers = ["Task", "Metric", "Value"]
        values = []
        for k, dic in results.items():
            for m, v in dic.items():
                values.append([k, m, '%.4f' % v])
                # Blank the task name after its first row so the table only
                # shows it once per task.
                k = ""
        writer.value_matrix = values
        print(writer.dumps())
# Script entry point.
if __name__ == "__main__":
    main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment