Unverified commit b5e86d3f, authored by Jonathan Tow, committed by GitHub

Merge branch 'master' into wsc273-evaluation

parents c32a13e8 a1a4a32e
@@ -58,10 +58,10 @@ class LM(abc.ABC):
         return cls()
 
-class Dataset(abc.ABC):
+class Task(abc.ABC):
     def __init__(self):
         self.download()
-        self._traindocs = None
+        self._training_docs = None
 
     def download(self):
         """Downloads the task dataset if necessary"""
@@ -71,7 +71,7 @@ class Dataset(abc.ABC):
     def has_training_docs(self):
         """Whether the task has a training set"""
         pass
 
     @abc.abstractmethod
     def has_validation_docs(self):
         """Whether the task has a validation set"""
@@ -84,23 +84,29 @@ class Dataset(abc.ABC):
     def training_docs(self):
         """
         :return: Iterable[obj]
             A iterable of any object, that doc_to_text can handle
         """
         return []
 
     def validation_docs(self):
+        """
+        :return: Iterable[obj]
+            A iterable of any object, that doc_to_text can handle
+        """
         return []
 
     def test_docs(self):
+        """
+        :return: Iterable[obj]
+            A iterable of any object, that doc_to_text can handle
+        """
         return []
 
-    def fewshot_examples(self, k):
-        if self._traindocs is None:
-            self._traindocs = list(self.training_docs())
-        return random.sample(self._traindocs, k)
+    def fewshot_examples(self, k):
+        if self._training_docs is None:
+            self._training_docs = list(self.training_docs())
+        return random.sample(self._training_docs, k)
 
     @abc.abstractmethod
     def doc_to_text(self, doc):
@@ -123,7 +129,7 @@ class Dataset(abc.ABC):
         part of the document for `doc`.
         """
         pass
 
     @abc.abstractmethod
     def process_results(self, doc, results):
         """Take a single document and the LM results and evaluates, returning a
@@ -161,7 +167,7 @@ class Dataset(abc.ABC):
     def fewshot_context(self, doc, num_fewshot, provide_description):
         raw_description = self.fewshot_description()
         description = (raw_description + "\n===\n\n") if provide_description and raw_description else ""
 
         if num_fewshot == 0:
             labeled_examples = ""
         else:
...
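
For orientation: the renamed fewshot_examples lazily materializes training_docs() into the _training_docs cache on first use, then samples k documents without replacement. A minimal self-contained sketch of that pattern (ToyTask and its documents are hypothetical, not part of this diff; it is not the real lm_eval.base.Task):

    import random

    class ToyTask:
        # Stand-in mirroring the fewshot_examples caching pattern above.
        def __init__(self):
            self._training_docs = None  # lazily-built cache

        def training_docs(self):
            # Pretend this streams documents from disk.
            return ({"q": f"question {i}", "a": f"answer {i}"} for i in range(1000))

        def fewshot_examples(self, k):
            # First call materializes the generator once; later calls reuse the list.
            if self._training_docs is None:
                self._training_docs = list(self.training_docs())
            return random.sample(self._training_docs, k)

    print(ToyTask().fewshot_examples(k=3))
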
@@ -2,12 +2,12 @@ import abc
 import json
 import os
 from collections import namedtuple
-from lm_eval.base import Dataset, mean, rf
+from lm_eval.base import Task, mean, rf
 from best_download import download_file
 
 ArithmeticDoc = namedtuple('ArithmeticDoc', ['context', 'completion'])
 
-class Arithmetic(Dataset):
+class Arithmetic(Task):
     directory = 'data/arithmetic/'
 
     def __init__(self):
...
 import datasets
 import numpy as np
-import random
-from ..base import Dataset
+from ..base import Task
 
-class HFTask(Dataset):
+class HFTask(Task):
     DATASET_PATH = None
     DATASET_NAME = None
 
     def __init__(self):
+        self.data = None
         super().__init__()
-        self._training_docs = None
 
     def download(self):
         self.data = datasets.load_dataset(path=self.DATASET_PATH, name=self.DATASET_NAME)
...
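
HFTask.download defers to the Hugging Face datasets library; subclasses only set DATASET_PATH and DATASET_NAME. A sketch of the equivalent direct call, using the winograd_wsc/wsc273 identifiers from the task added later in this diff:

    import datasets

    # Equivalent to HFTask.download for the wsc273 task: fetch the dataset
    # from the Hugging Face Hub (cached locally after the first call).
    data = datasets.load_dataset(path="winograd_wsc", name="wsc273")
    print(data["test"][0]["text"])
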
@@ -2,11 +2,11 @@
 import json
 import random
-from lm_eval.base import Dataset
+from lm_eval.base import Task
 from ..utils import sh
 
-class CoQA(Dataset):
+class CoQA(Task):
     def __init__(self):
         self.download()
 
     def download(self):
...
@@ -5,9 +5,9 @@ from sklearn.metrics import f1_score, matthews_corrcoef
 from tqdm import auto as tqdm_lib
 from . common import HFTask, simple_accuracy_metric, yesno
 from pathlib import Path
-from ..base import Dataset
+from ..base import Task
 
-class DROP(Dataset):
+class DROP(Task):
     DATAFOLDER = Path(__file__).parent / "../../data/drop"
 
     def __init__(self):
...
-from lm_eval.base import Dataset, rf, mean
+from lm_eval.base import Task, rf, mean
 from lm_eval.utils import sh
 import json
 import math
 from best_download import download_file
 
-class LAMBADA(Dataset):
+class LAMBADA(Task):
     def download(self):
         sh("mkdir -p data/lambada")
         download_file(
...
@@ -30,10 +30,10 @@ class NaturalQs(HFTask):
     def fewshot_examples(self, k):
         # Data is too large to fit in memory. We just sample from the first bit.
-        if self._traindocs is None:
-            self._traindocs = list(islice(self.training_docs(), 0, 100000))
-        return random.sample(self._traindocs, k)
+        if self._training_docs is None:
+            self._training_docs = list(islice(self.training_docs(), 0, 100000))
+        return random.sample(self._training_docs, k)
 
     def doc_to_text(self, doc):
         return 'Q: ' + doc['question']['text'] + '\n\n' + 'A: '
...
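
The comment above captures a common pattern for corpora too large to hold in memory: cap how much of the stream is ever materialized with itertools.islice, then sample from that prefix. A self-contained sketch (the generator is a hypothetical stand-in for the real training stream):

    import random
    from itertools import islice

    def stream_docs():
        # Hypothetical stand-in for a training stream too large to load fully.
        return ({"id": i} for i in range(10_000_000))

    # Materialize only the first 100000 documents, as NaturalQs does above,
    # then draw few-shot examples from that prefix.
    pool = list(islice(stream_docs(), 0, 100000))
    shots = random.sample(pool, 5)
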
 import json
 import random
-from lm_eval.base import Dataset, rf, mean
+from lm_eval.base import Task, rf, mean
 from ..utils import sh
 import os
 
-class PiQA(Dataset):
+class PiQA(Task):
     def download(self):
         if not os.path.exists('data/piqa'):
             #TODO: use best_download
...
 import json
 import random
 import os
-from lm_eval.base import Dataset
+from lm_eval.base import Task
 from ..utils import sh
 
-class QuAC(Dataset):
+class QuAC(Task):
     def __init__(self):
         super().__init__()
...
 import json
 import random
 import os
-from lm_eval.base import Dataset, rf, mean
+from lm_eval.base import Task, rf, mean
 from tqdm import auto as tqdm_lib
 from . common import simple_accuracy_metric
 import numpy as np
 from ..utils import sh
 
-class SATAnalogies(Dataset):
+class SATAnalogies(Task):
     NEEDS_MANUAL_DL = True
 
     def __init__(self):
...
 import json
 import random
-from lm_eval.base import Dataset
+from lm_eval.base import Task
 from ..utils import sh
 import csv
 
-class StoryCloze(Dataset):
+class StoryCloze(Task):
     NEEDS_MANUAL_DL = True
 
     def download(self):
...
 import os
 import json
 import random
-from lm_eval.base import Dataset, mean, rf
+from lm_eval.base import Task, mean, rf
 from ..utils import sh
 
-class TriviaQA(Dataset):
+class TriviaQA(Task):
     def download(self):
         if not os.path.exists('data/triviaqa'):
             sh("""
...
 import numpy as np
 import random
 from lm_eval.base import rf, mean
 from . common import HFTask
 
 """
 NOTE: This evaluation of Winograd Schema Challenge is based on `partial evaluation`
 as described by Trinh & Le in Simple Method for Commonsense Reasoning (2018).
 See: https://arxiv.org/abs/1806.02847
 """
 
 class WinogradSchemaChallenge273(HFTask):
     DATASET_PATH = "winograd_wsc"
     DATASET_NAME = "wsc273"
 
     upper_pronouns = ["A", "An", "The", "She", "He",
                       "It", "They", "My", "His", "Her", "Their"]
 
     def __init__(self):
         super().__init__()
         self.data = self.__clean_data()
 
     def __clean_data(self):
         # The HF implementation of `wsc273` is not `partial evaluation` friendly.
         data = []
         for doc in self.data["test"]:
             doc["text"] = doc["text"].replace("  ", " ")
             doc["options"][0] = self.__normalize_option(doc["options"][0], doc)
             doc["options"][1] = self.__normalize_option(doc["options"][1], doc)
             data.append(doc)
         return {"test": data}
 
     def __normalize_option(self, option, doc):
         # Append `'s` to possessive determiner based options.
         if doc["pronoun"].lower() in ["my", "his", "her", "our", "their"]:
             option += "'s"
         # Appropriately lowercase the pronoun in the option.
         pronoun = option.split()[0]
         start_of_sentence = doc["text"][doc['pronoun_loc'] - 2] == '.'
         if not start_of_sentence and pronoun in self.upper_pronouns:
             return option.replace(pronoun, pronoun.lower())
         return option
 
     def has_training_docs(self):
         return False
 
     def has_validation_docs(self):
         return False
 
     def has_test_docs(self):
         return True
 
     def fewshot_examples(self, k):
         # NOTE: `super().fewshot_examples` samples from training docs which are
         # not available for this test-set-only dataset.
         return random.sample(list(self.test_docs()), k)
 
     def fewshot_description(self):
         # TODO: redo description
         return "Winograd schema sentence with correct continuation. True. Winograd schema sentence with incorrect continuation. False."
 
     @classmethod
     def partial_context(cls, doc):
         # Substitute the pronoun in the original text with each candidate
         # choice and ignore everything after.
         context1 = doc["text"][:doc["pronoun_loc"]] + doc["options"][0]
         context2 = doc["text"][:doc["pronoun_loc"]] + doc["options"][1]
         return context1, context2
 
     @classmethod
     def partial_target(cls, doc):
         # The target is everything after the document specified pronoun.
         start_index = doc["pronoun_loc"] + len(doc["pronoun"])
         return doc["text"][start_index:].strip()
 
     def doc_to_text(self, doc):
         context1, context2 = self.partial_context(doc)
         return context1 + '\n' + context2 + '\n'
 
     def doc_to_target(self, doc):
         return self.partial_target(doc)
 
     def construct_requests(self, doc, ctx):
         """ Uses RequestFactory to construct Requests and returns an iterable of
         Requests which will be sent to the LM.
 
         :param doc:
             The document as returned from training_docs, validation_docs, or test_docs.
         :param ctx: str
             The context string, generated by fewshot_context. This includes the natural
             language description, as well as the few shot examples, and the question
             part of the document for `doc`.
         """
         target = self.partial_target(doc)
         context1, context2 = self.partial_context(doc)
         ll_context1, _ = rf.loglikelihood(context1, " " + target)
         ll_context2, _ = rf.loglikelihood(context2, " " + target)
         return ll_context1, ll_context2
 
     def process_results(self, doc, results):
         """Take a single document and the LM results and evaluates, returning a
         dict where keys are the names of submetrics and values are the values of
         the metric for that one document
 
         :param doc:
             The document as returned from training_docs, validation_docs, or test_docs.
         :param results:
             The results of the requests created in construct_requests.
         """
         return {
             "acc": np.argmax(results) == doc["label"]
         }
 
     def aggregation(self):
         """
         :returns: {str: [float] -> float}
             A dictionary where keys are the names of submetrics and values are
             functions that aggregate a list of metrics
         """
         return {
             "acc": mean
         }
 
     def higher_is_better(self):
         """
         :returns: {str: bool}
             A dictionary where keys are the names of submetrics and values are
             whether a higher value of the submetric is better
         """
         return {
             "acc": True
         }
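
For readers unfamiliar with `partial evaluation`: the pronoun is replaced by each candidate, and the same continuation (everything after the pronoun) is scored under both substituted contexts; the candidate whose context gives the continuation the higher log-likelihood wins. A self-contained sketch of that decision rule, where loglikelihood is a hypothetical stand-in for the LM scorer behind rf.loglikelihood and the document is a made-up example:

    def loglikelihood(context, continuation):
        # Hypothetical stand-in: a real implementation returns
        # log P(continuation | context) under a language model.
        return -len(continuation)  # dummy score, for illustration only

    doc = {
        "text": "The trophy doesn't fit in the suitcase because it is too big.",
        "pronoun": "it",
        "pronoun_loc": 47,
        "options": ["the trophy", "the suitcase"],
        "label": 0,
    }

    # Mirror partial_context / partial_target from the class above.
    contexts = [doc["text"][:doc["pronoun_loc"]] + opt for opt in doc["options"]]
    target = doc["text"][doc["pronoun_loc"] + len(doc["pronoun"]):].strip()  # "is too big."

    # Score the shared continuation under each substituted context and take
    # the argmax, as construct_requests + process_results do together.
    scores = [loglikelihood(c, " " + target) for c in contexts]
    pred = max(range(len(scores)), key=scores.__getitem__)
    is_correct = (pred == doc["label"])
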