Unverified commit b5e86d3f authored by Jonathan Tow, committed by GitHub

Merge branch 'master' into wsc273-evaluation

parents c32a13e8 a1a4a32e
......@@ -58,10 +58,10 @@ class LM(abc.ABC):
         return cls()
 
 
-class Dataset(abc.ABC):
+class Task(abc.ABC):
     def __init__(self):
         self.download()
-        self._traindocs = None
+        self._training_docs = None
 
     def download(self):
         """Downloads the task dataset if necessary"""
......@@ -71,7 +71,7 @@ class Dataset(abc.ABC):
     def has_training_docs(self):
         """Whether the task has a training set"""
         pass
 
     @abc.abstractmethod
     def has_validation_docs(self):
         """Whether the task has a validation set"""
......@@ -84,23 +84,29 @@ class Dataset(abc.ABC):
     def training_docs(self):
         """
         :return: Iterable[obj]
             An iterable of any object that doc_to_text can handle
         """
         return []
 
     def validation_docs(self):
         """
         :return: Iterable[obj]
             An iterable of any object that doc_to_text can handle
         """
         return []
 
     def test_docs(self):
         """
         :return: Iterable[obj]
             An iterable of any object that doc_to_text can handle
         """
         return []
 
-    def fewshot_examples(self, k):
-        if self._traindocs is None:
-            self._traindocs = list(self.training_docs())
-        return random.sample(self._traindocs, k)
+    def fewshot_examples(self, k):
+        if self._training_docs is None:
+            self._training_docs = list(self.training_docs())
+        return random.sample(self._training_docs, k)
 
     @abc.abstractmethod
     def doc_to_text(self, doc):
......@@ -123,7 +129,7 @@ class Dataset(abc.ABC):
         part of the document for `doc`.
         """
         pass
 
     @abc.abstractmethod
     def process_results(self, doc, results):
         """Take a single document and the LM results and evaluate, returning a
......@@ -161,7 +167,7 @@ class Dataset(abc.ABC):
     def fewshot_context(self, doc, num_fewshot, provide_description):
         raw_description = self.fewshot_description()
         description = (raw_description + "\n===\n\n") if provide_description and raw_description else ""
 
         if num_fewshot == 0:
             labeled_examples = ""
         else:
......
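For orientation, here is a minimal sketch of what the rename means for task authors: subclasses now derive from Task rather than Dataset and fill in the hooks shown in the hunks above. Everything below (DummyTask, its toy documents, and the exact set of abstract hooks being stubbed) is an assumption pieced together from this diff, not code from the commit.

from lm_eval.base import Task


class DummyTask(Task):
    """Toy task; each method below stubs one base-class hook."""

    def download(self):
        pass  # nothing to fetch for this toy task

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return False

    def has_test_docs(self):
        return False

    def training_docs(self):
        return [{"q": "2 + 2 =", "a": "4"}, {"q": "3 + 3 =", "a": "6"}]

    def doc_to_text(self, doc):
        return "Q: " + doc["q"] + "\nA:"

    def doc_to_target(self, doc):
        return " " + doc["a"]

    def fewshot_description(self):
        return ""

    # Stubs for the remaining hooks assumed to be abstract (see the wsc273
    # task later in this diff); a real task would implement them meaningfully.
    def construct_requests(self, doc, ctx):
        return []

    def process_results(self, doc, results):
        return {}

    def aggregation(self):
        return {}

    def higher_is_better(self):
        return {}


task = DummyTask()
# fewshot_examples caches training_docs into self._training_docs on the
# first call, then samples k documents without replacement:
print(task.fewshot_examples(k=2))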
......@@ -2,12 +2,12 @@ import abc
 import json
 import os
 from collections import namedtuple
-from lm_eval.base import Dataset, mean, rf
+from lm_eval.base import Task, mean, rf
 from best_download import download_file
 
 ArithmeticDoc = namedtuple('ArithmeticDoc', ['context', 'completion'])
 
-class Arithmetic(Dataset):
+class Arithmetic(Task):
     directory = 'data/arithmetic/'
 
     def __init__(self):
......
 import datasets
 import numpy as np
 import random
-from ..base import Dataset
+from ..base import Task
 
 
-class HFTask(Dataset):
+class HFTask(Task):
     DATASET_PATH = None
     DATASET_NAME = None
 
     def __init__(self):
         self.data = None
         super().__init__()
         self._training_docs = None
 
     def download(self):
         self.data = datasets.load_dataset(path=self.DATASET_PATH, name=self.DATASET_NAME)
......
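The HFTask rename follows the same pattern for Hugging Face-backed tasks: a subclass points DATASET_PATH/DATASET_NAME at a dataset and inherits download(), which fills self.data via datasets.load_dataset. A hedged sketch; the super_glue/boolq choice and the lm_eval.tasks.common module path are assumptions inferred from the relative imports in this diff, not part of the commit.

from lm_eval.tasks.common import HFTask  # module path assumed from the diff


class MyBoolQ(HFTask):
    # Any path/name pair accepted by datasets.load_dataset works here.
    DATASET_PATH = "super_glue"
    DATASET_NAME = "boolq"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        # self.data is the DatasetDict loaded by the inherited download()
        return self.data["train"]

    def doc_to_text(self, doc):
        return doc["passage"] + "\nQuestion: " + doc["question"] + "?\nAnswer:"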
......@@ -2,11 +2,11 @@
 import json
 import random
 
-from lm_eval.base import Dataset
+from lm_eval.base import Task
 from ..utils import sh
 
-class CoQA(Dataset):
+class CoQA(Task):
     def __init__(self):
         self.download()
 
     def download(self):
......
......@@ -5,9 +5,9 @@ from sklearn.metrics import f1_score, matthews_corrcoef
 from tqdm import auto as tqdm_lib
 from . common import HFTask, simple_accuracy_metric, yesno
 from pathlib import Path
-from ..base import Dataset
+from ..base import Task
 
-class DROP(Dataset):
+class DROP(Task):
     DATAFOLDER = Path(__file__).parent / "../../data/drop"
 
     def __init__(self):
......
-from lm_eval.base import Dataset, rf, mean
+from lm_eval.base import Task, rf, mean
 from lm_eval.utils import sh
 import json
 import math
 from best_download import download_file
 
-class LAMBADA(Dataset):
+class LAMBADA(Task):
     def download(self):
         sh("mkdir -p data/lambada")
         download_file(
......
......@@ -30,10 +30,10 @@ class NaturalQs(HFTask):
     def fewshot_examples(self, k):
         # Data is too large to fit in memory. We just sample from the first bit.
-        if self._traindocs is None:
-            self._traindocs = list(islice(self.training_docs(), 0, 100000))
+        if self._training_docs is None:
+            self._training_docs = list(islice(self.training_docs(), 0, 100000))
 
-        return random.sample(self._traindocs, k)
+        return random.sample(self._training_docs, k)
 
     def doc_to_text(self, doc):
         return 'Q: ' + doc['question']['text'] + '\n\n' + 'A: '
......
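The NaturalQs override above is a general trick for corpora too large to materialize: cap the candidate pool with itertools.islice, then sample from the slice. A standalone sketch (the integer stream stands in for training_docs()):

import random
from itertools import islice


def capped_fewshot(doc_stream, k, cap=100_000):
    # Materialize at most `cap` documents, then sample k without replacement.
    pool = list(islice(doc_stream, 0, cap))
    return random.sample(pool, k)


print(capped_fewshot(iter(range(1_000_000)), k=3))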
 import json
 import random
-from lm_eval.base import Dataset, rf, mean
+from lm_eval.base import Task, rf, mean
 from ..utils import sh
 import os
 
-class PiQA(Dataset):
+class PiQA(Task):
     def download(self):
         if not os.path.exists('data/piqa'):
             #TODO: use best_download
......
 import json
 import random
 import os
-from lm_eval.base import Dataset
+from lm_eval.base import Task
 from ..utils import sh
 
-class QuAC(Dataset):
+class QuAC(Task):
     def __init__(self):
         super().__init__()
......
 import json
 import random
 import os
-from lm_eval.base import Dataset, rf, mean
+from lm_eval.base import Task, rf, mean
 from tqdm import auto as tqdm_lib
 from . common import simple_accuracy_metric
 import numpy as np
 from ..utils import sh
 
-class SATAnalogies(Dataset):
+class SATAnalogies(Task):
     NEEDS_MANUAL_DL = True
 
     def __init__(self):
......
 import json
 import random
-from lm_eval.base import Dataset
+from lm_eval.base import Task
 from ..utils import sh
 import csv
 
-class StoryCloze(Dataset):
+class StoryCloze(Task):
     NEEDS_MANUAL_DL = True
 
     def download(self):
......
 import os
 import json
 import random
-from lm_eval.base import Dataset, mean, rf
+from lm_eval.base import Task, mean, rf
 from ..utils import sh
 
-class TriviaQA(Dataset):
+class TriviaQA(Task):
     def download(self):
         if not os.path.exists('data/triviaqa'):
             sh("""
......
+import numpy as np
+import random
+
+from lm_eval.base import rf, mean
+from . common import HFTask
+
+"""
+NOTE: This evaluation of the Winograd Schema Challenge is based on `partial evaluation`
+as described by Trinh & Le in A Simple Method for Commonsense Reasoning (2018).
+See: https://arxiv.org/abs/1806.02847
+"""
+
+
+class WinogradSchemaChallenge273(HFTask):
+    DATASET_PATH = "winograd_wsc"
+    DATASET_NAME = "wsc273"
+
+    upper_pronouns = ["A", "An", "The", "She", "He",
+                      "It", "They", "My", "His", "Her", "Their"]
+
+    def __init__(self):
+        super().__init__()
+        self.data = self.__clean_data()
+
+    def __clean_data(self):
+        # The HF implementation of `wsc273` is not `partial evaluation` friendly.
+        data = []
+        for doc in self.data["test"]:
+            doc["text"] = doc["text"].replace("  ", " ")
+            doc["options"][0] = self.__normalize_option(doc["options"][0], doc)
+            doc["options"][1] = self.__normalize_option(doc["options"][1], doc)
+            data.append(doc)
+        return {"test": data}
+
+    def __normalize_option(self, option, doc):
+        # Append `'s` to possessive determiner based options.
+        if doc["pronoun"].lower() in ["my", "his", "her", "our", "their"]:
+            option += "'s"
+        # Appropriately lowercase the pronoun in the option.
+        pronoun = option.split()[0]
+        start_of_sentence = doc["text"][doc['pronoun_loc'] - 2] == '.'
+        if not start_of_sentence and pronoun in self.upper_pronouns:
+            return option.replace(pronoun, pronoun.lower())
+        return option
+
+    def has_training_docs(self):
+        return False
+
+    def has_validation_docs(self):
+        return False
+
+    def has_test_docs(self):
+        return True
+
+    def fewshot_examples(self, k):
+        # NOTE: `super().fewshot_examples` samples from training docs, which are
+        # not available for this test-set-only dataset.
+        return random.sample(list(self.test_docs()), k)
+
+    def fewshot_description(self):
+        # TODO: redo description
+        return "Winograd schema sentence with correct continuation. True. Winograd schema sentence with incorrect continuation. False."
+
+    @classmethod
+    def partial_context(cls, doc):
+        # Substitute the pronoun in the original text with each candidate
+        # choice and ignore everything after.
+        context1 = doc["text"][:doc["pronoun_loc"]] + doc["options"][0]
+        context2 = doc["text"][:doc["pronoun_loc"]] + doc["options"][1]
+        return context1, context2
+
+    @classmethod
+    def partial_target(cls, doc):
+        # The target is everything after the document-specified pronoun.
+        start_index = doc["pronoun_loc"] + len(doc["pronoun"])
+        return doc["text"][start_index:].strip()
+
+    def doc_to_text(self, doc):
+        context1, context2 = self.partial_context(doc)
+        return context1 + '\n' + context2 + '\n'
+
+    def doc_to_target(self, doc):
+        return self.partial_target(doc)
+
+    def construct_requests(self, doc, ctx):
+        """Uses RequestFactory to construct Requests and returns an iterable of
+        Requests which will be sent to the LM.
+
+        :param doc:
+            The document as returned from training_docs, validation_docs, or test_docs.
+        :param ctx: str
+            The context string, generated by fewshot_context. This includes the natural
+            language description, as well as the few-shot examples, and the question
+            part of the document for `doc`.
+        """
+        target = self.partial_target(doc)
+        context1, context2 = self.partial_context(doc)
+        ll_context1, _ = rf.loglikelihood(context1, " " + target)
+        ll_context2, _ = rf.loglikelihood(context2, " " + target)
+        return ll_context1, ll_context2
+
+    def process_results(self, doc, results):
+        """Take a single document and the LM results and evaluate, returning a
+        dict where keys are the names of submetrics and values are the values of
+        the metric for that one document.
+
+        :param doc:
+            The document as returned from training_docs, validation_docs, or test_docs.
+        :param results:
+            The results of the requests created in construct_requests.
+        """
+        return {
+            "acc": np.argmax(results) == doc["label"]
+        }
+
+    def aggregation(self):
+        """
+        :returns: {str: [float] -> float}
+            A dictionary where keys are the names of submetrics and values are
+            functions that aggregate a list of metrics
+        """
+        return {
+            "acc": mean
+        }
+
+    def higher_is_better(self):
+        """
+        :returns: {str: bool}
+            A dictionary where keys are the names of submetrics and values are
+            whether a higher value of the submetric is better
+        """
+        return {
+            "acc": True
+        }
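To make the partial-evaluation scoring concrete, here is a hand-built document (not a real wsc273 row) pushed through the same slicing that partial_context and partial_target perform; the model is then asked which prefix makes the shared continuation more likely.

doc = {
    "text": "The trophy doesn't fit in the suitcase because it is too big.",
    "pronoun": "it",
    "pronoun_loc": 47,   # index of "it" in text
    "options": ["the trophy", "the suitcase"],
    "label": 0,          # option 0 resolves the pronoun correctly
}

# partial_context: keep everything before the pronoun, substitute each option.
context1 = doc["text"][:doc["pronoun_loc"]] + doc["options"][0]
context2 = doc["text"][:doc["pronoun_loc"]] + doc["options"][1]

# partial_target: everything after the pronoun is shared by both candidates.
target = doc["text"][doc["pronoun_loc"] + len(doc["pronoun"]):].strip()

print(context1)  # ...because the trophy
print(context2)  # ...because the suitcase
print(target)    # is too big.

# The harness then requests log P(" " + target | context_i) for each context
# via rf.loglikelihood and counts the doc as correct when the argmax over the
# two loglikelihoods equals doc["label"].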