Unverified Commit 5fe7e2c0 authored by Stella Biderman, committed by GitHub

Merge branch 'master' into master

parents 14b8ec7e 37afce3b
"""
Don't Give Me the Details, Just the Summary! Topic-Aware Convolutional Neural Networks for Extreme Summarization
https://arxiv.org/pdf/1808.08745.pdf
The dataset is for the task of abstractive summarization in its extreme form: summarizing a document in a single sentence. It introduces extreme summarization, a new single-document summarization task which does not favor extractive strategies and calls for an abstractive modeling approach. The idea is to create a short, one-sentence news summary answering the question "What is the article about?".
This task uses the version of the dataset distributed as part of the GEM benchmark.
Homepage: https://github.com/EdinburghNLP/XSum

The GEM Benchmark: Natural Language Generation, its Evaluation and Metrics
https://arxiv.org/pdf/2102.01672v3.pdf
GEM data card: https://gem-benchmark.com/data_cards/XSum
"""
from lm_eval.base import PromptSourceTask
_CITATION = """
@InProceedings{xsum-emnlp,
author = "Shashi Narayan and Shay B. Cohen and Mirella Lapata",
title = "Don't Give Me the Details, Just the Summary! {T}opic-Aware Convolutional Neural Networks for Extreme Summarization",
booktitle = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing ",
year = "2018",
address = "Brussels, Belgium",
}
"""
class GEMXSUMBase(PromptSourceTask):
VERSION = 0
DATASET_PATH = "GEM/xsum"
DATASET_NAME = None
SPLIT = None
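# Subclasses select which family of GEM/xsum splits to evaluate (the standard splits or one of the challenge sets defined below).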
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def training_docs(self):
if self.has_training_docs():
# We cache training documents in `self._training_docs` for faster
# few-shot processing. If the data is too large to fit in memory,
# return the training data as a generator instead of a list.
if self._training_docs is None:
self._training_docs = list(self.dataset["train"])
return self._training_docs
def validation_docs(self):
if self.has_validation_docs():
return self.dataset["validation"]
def test_docs(self):
if self.has_test_docs():
return self.dataset["test"]
def max_generation_length(self):
return 64
class GEMXSUM(GEMXSUMBase):
"""Standard train/validation/test splits."""
SPLIT = ''
class GEMXSUMChallgeSample(GEMXSUMBase):
"""Challenge splits: challenge_train_sample / challenge_validation_sample."""
SPLIT = 'challenge_sample'
def has_test_docs(self):
return False
def training_docs(self):
if self.has_training_docs():
# We cache training documents in `self._training_docs` for faster
# few-shot processing. If the data is too large to fit in memory,
# return the training data as a generator instead of a list.
if self._training_docs is None:
self._training_docs = list(self.dataset["challenge_train_sample"])
return self._training_docs
def validation_docs(self):
if self.has_validation_docs():
return self.dataset["challenge_validation_sample"]
class GEMXSUMChallgeTestBacktranslation(GEMXSUMBase):
"""Challenge split: challenge_test_backtranslation."""
SPLIT = 'challenge_test_backtranslation'
def has_training_docs(self):
return False
def has_validation_docs(self):
return False
def test_docs(self):
if self.has_test_docs():
return self.dataset[self.SPLIT]
class GEMXSUMChallgeTestBFP02(GEMXSUMBase):
"""Challenge split: challenge_test_bfp_02."""
SPLIT = 'challenge_test_bfp_02'
def has_training_docs(self):
return False
def has_validation_docs(self):
return False
def test_docs(self):
if self.has_test_docs():
return self.dataset[self.SPLIT]
class GEMXSUMChallgeTestBFP05(GEMXSUMBase):
"""Challenge split: challenge_test_bfp_05."""
SPLIT = 'challenge_test_bfp_05'
def has_training_docs(self):
return False
def has_validation_docs(self):
return False
def test_docs(self):
if self.has_test_docs():
return self.dataset[self.SPLIT]
class GEMXSUMChallgeTestNopunc(GEMXSUMBase):
"""Challenge split: challenge_test_nopunc."""
SPLIT = 'challenge_test_nopunc'
def has_training_docs(self):
return False
def has_validation_docs(self):
return False
def test_docs(self):
if self.has_test_docs():
return self.dataset[self.SPLIT]
class GEMXSUMChallgeTestCovid(GEMXSUMBase):
"""Challenge split: challenge_test_covid."""
SPLIT = 'challenge_test_covid'
def has_training_docs(self):
return False
def has_validation_docs(self):
return False
def test_docs(self):
if self.has_test_docs():
return self.dataset[self.SPLIT]
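# Hypothetical usage sketch (not part of the harness): inspect the GEM/xsum
# splits that the task classes above read from. Assumes the `datasets`
# library is installed and the dataset is reachable.
if __name__ == "__main__":
    from datasets import load_dataset

    xsum_splits = load_dataset("GEM/xsum")
    # Expect train/validation/test plus challenge_* splits such as
    # challenge_train_sample and challenge_test_covid.
    print(list(xsum_splits.keys()))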
...@@ -14,7 +14,7 @@ respect to a wide range of linguistic phenomena found in natural language. ...@@ -14,7 +14,7 @@ respect to a wide range of linguistic phenomena found in natural language.
Homepage: https://gluebenchmark.com/ Homepage: https://gluebenchmark.com/
""" """
import numpy as np import numpy as np
from lm_eval.base import rf, Task from lm_eval.base import PromptSourceTask, rf, Task
from lm_eval.metrics import mean, matthews_corrcoef, f1_score, yesno from lm_eval.metrics import mean, matthews_corrcoef, f1_score, yesno
from lm_eval.utils import general_detokenize from lm_eval.utils import general_detokenize
...@@ -45,7 +45,7 @@ _CITATION = """ ...@@ -45,7 +45,7 @@ _CITATION = """
# Single-Sentence Tasks # Single-Sentence Tasks
class CoLA(Task): class CoLA(PromptSourceTask):
VERSION = 0 VERSION = 0
DATASET_PATH = "glue" DATASET_PATH = "glue"
DATASET_NAME = "cola" DATASET_NAME = "cola"
...@@ -67,37 +67,20 @@ class CoLA(Task): ...@@ -67,37 +67,20 @@ class CoLA(Task):
def validation_docs(self): def validation_docs(self):
return self.dataset["validation"] return self.dataset["validation"]
def doc_to_text(self, doc): # def process_results(self, doc, results):
return "{}\nQuestion: Does this sentence make sense?\nAnswer:".format(doc["sentence"]) # answer_choices_list = self.prompt.get_answer_choices_list(doc)
# pred = np.argmax(results)
# target = answer_choices_list.index(self.doc_to_target(doc).strip())
# return {"mcc": (target, pred)}
def doc_to_target(self, doc): # def higher_is_better(self):
return " {}".format({1: "yes", 0: "no"}[doc["label"]]) # return {"mcc": True}
def construct_requests(self, doc, ctx): # def aggregation(self):
ll_true, _ = rf.loglikelihood(ctx, " yes") # return {"mcc": matthews_corrcoef}
ll_false, _ = rf.loglikelihood(ctx, " no")
return ll_true, ll_false
def process_results(self, doc, results):
ll_true, ll_false = results
pred = ll_true > ll_false
gold = doc["label"]
return {
"mcc": (gold, pred)
}
def higher_is_better(self): class SST(PromptSourceTask):
return {
"mcc": True
}
def aggregation(self):
return {
"mcc": matthews_corrcoef
}
class SST(Task):
VERSION = 0 VERSION = 0
DATASET_PATH = "glue" DATASET_PATH = "glue"
DATASET_NAME = "sst2" DATASET_NAME = "sst2"
...@@ -119,42 +102,11 @@ class SST(Task): ...@@ -119,42 +102,11 @@ class SST(Task):
def validation_docs(self): def validation_docs(self):
return self.dataset["validation"] return self.dataset["validation"]
def doc_to_text(self, doc):
return "{}\nQuestion: Is this sentence positive or negative?\nAnswer:".format(
general_detokenize(doc["sentence"]),
)
def doc_to_target(self, doc):
return " {}".format({1: "positive", 0: "negative"}[doc["label"]])
def construct_requests(self, doc, ctx):
ll_positive, _ = rf.loglikelihood(ctx, " positive")
ll_negative, _ = rf.loglikelihood(ctx, " negative")
return ll_positive, ll_negative
def process_results(self, doc, results):
ll_positive, ll_negative = results
pred = ll_positive > ll_negative
gold = doc["label"]
return {
"acc": pred == gold
}
def higher_is_better(self):
return {
"acc": True
}
def aggregation(self):
return {
"acc": mean
}
# Inference Tasks # Inference Tasks
class MNLI(Task): class MNLI(PromptSourceTask):
VERSION = 0 VERSION = 0
DATASET_PATH = "glue" DATASET_PATH = "glue"
DATASET_NAME = "mnli" DATASET_NAME = "mnli"
...@@ -181,41 +133,6 @@ class MNLI(Task): ...@@ -181,41 +133,6 @@ class MNLI(Task):
if self.has_test_docs(): if self.has_test_docs():
return self.dataset["test_matched"] return self.dataset["test_matched"]
def doc_to_text(self, doc):
return "{}\nQuestion: {} True, False or Neither?\nAnswer:".format(
doc["premise"],
doc["hypothesis"].strip() + ('' if doc["hypothesis"].strip().endswith('.') else '.'),
)
def doc_to_target(self, doc):
# True = entailment
# False = contradiction
# Neither = neutral
return " {}".format({0: "True", 1: "Neither", 2: "False"}[doc["label"]])
def construct_requests(self, doc, ctx):
ll_true, _ = rf.loglikelihood(ctx, " True")
ll_neither, _ = rf.loglikelihood(ctx, " Neither")
ll_false, _ = rf.loglikelihood(ctx, " False")
return ll_true, ll_neither, ll_false
def process_results(self, doc, results):
gold = doc["label"]
pred = np.argmax(results)
return {
"acc": pred == gold
}
def higher_is_better(self):
return {
"acc": True
}
def aggregation(self):
return {
"acc": mean
}
class MNLIMismatched(MNLI): class MNLIMismatched(MNLI):
VERSION = 0 VERSION = 0
...@@ -229,7 +146,7 @@ class MNLIMismatched(MNLI): ...@@ -229,7 +146,7 @@ class MNLIMismatched(MNLI):
return self.dataset["test_mismatched"] return self.dataset["test_mismatched"]
class QNLI(Task): class QNLI(PromptSourceTask):
VERSION = 0 VERSION = 0
DATASET_PATH = "glue" DATASET_PATH = "glue"
DATASET_NAME = "qnli" DATASET_NAME = "qnli"
...@@ -251,42 +168,8 @@ class QNLI(Task): ...@@ -251,42 +168,8 @@ class QNLI(Task):
def validation_docs(self): def validation_docs(self):
return self.dataset["validation"] return self.dataset["validation"]
def doc_to_text(self, doc):
return "{}\n{}\nQuestion: Does this response answer the question?\nAnswer:".format(
doc["question"],
doc["sentence"],
)
def doc_to_target(self, doc):
# True = entailment
# False = not entailment
return " {}".format({0: "yes", 1: "no"}[doc["label"]])
def construct_requests(self, doc, ctx):
ll_yes, _ = rf.loglikelihood(ctx, " yes")
ll_no, _ = rf.loglikelihood(ctx, " no")
return ll_yes, ll_no
def process_results(self, doc, results):
ll_yes, ll_no = results
pred = ll_no > ll_yes
gold = doc["label"]
return {
"acc": pred == gold
}
def higher_is_better(self): class WNLI(PromptSourceTask):
return {
"acc": True
}
def aggregation(self):
return {
"acc": mean
}
class WNLI(Task):
VERSION = 1 VERSION = 1
DATASET_PATH = "glue" DATASET_PATH = "glue"
DATASET_NAME = "wnli" DATASET_NAME = "wnli"
...@@ -301,49 +184,13 @@ class WNLI(Task): ...@@ -301,49 +184,13 @@ class WNLI(Task):
return False return False
def training_docs(self): def training_docs(self):
if self._training_docs is None: return self.dataset["train"]
self._training_docs = list(self.dataset["train"])
return self._training_docs
def validation_docs(self): def validation_docs(self):
return self.dataset["validation"] return self.dataset["validation"]
def doc_to_text(self, doc):
return "{}\nQuestion: {} True or False?\nAnswer:".format(
doc["sentence1"],
doc["sentence2"],
)
def doc_to_target(self, doc):
# True = entailment
# False = not_entailment
return " {}".format({0: "False", 1: "True"}[doc["label"]])
def construct_requests(self, doc, ctx):
ll_true, _ = rf.loglikelihood(ctx, " True")
ll_false, _ = rf.loglikelihood(ctx, " False")
return ll_true, ll_false
def process_results(self, doc, results):
ll_true, ll_false = results
pred = ll_true > ll_false
gold = doc["label"]
return {
"acc": pred == gold
}
def higher_is_better(self):
return {
"acc": True
}
def aggregation(self):
return {
"acc": mean
}
class RTE(PromptSourceTask):
class RTE(Task):
VERSION = 0 VERSION = 0
DATASET_PATH = "glue" DATASET_PATH = "glue"
DATASET_NAME = "rte" DATASET_NAME = "rte"
...@@ -365,45 +212,17 @@ class RTE(Task): ...@@ -365,45 +212,17 @@ class RTE(Task):
def validation_docs(self): def validation_docs(self):
return self.dataset["validation"] return self.dataset["validation"]
def doc_to_text(self, doc):
return "{}\nQuestion: {} True or False?\nAnswer:".format(
doc["sentence1"],
doc["sentence2"],
)
def doc_to_target(self, doc):
# 0 = entailment
# 1 = not_entailment
return " {}".format({0: "True", 1: "False"}[doc["label"]])
def construct_requests(self, doc, ctx):
ll_true, _ = rf.loglikelihood(ctx, " True")
ll_false, _ = rf.loglikelihood(ctx, " False")
return ll_true, ll_false
def process_results(self, doc, results):
ll_true, ll_false = results
pred = ll_false > ll_true
gold = doc["label"]
return {
"acc": pred == gold
}
def higher_is_better(self): def higher_is_better(self):
return { return {"acc": True}
"acc": True
}
def aggregation(self): def aggregation(self):
return { return {"acc": mean}
"acc": mean
}
# Similarity and Paraphrase Tasks # Similarity and Paraphrase Tasks
class MRPC(Task): class MRPC(PromptSourceTask):
VERSION = 0 VERSION = 0
DATASET_PATH = "glue" DATASET_PATH = "glue"
DATASET_NAME = "mrpc" DATASET_NAME = "mrpc"
...@@ -425,43 +244,8 @@ class MRPC(Task): ...@@ -425,43 +244,8 @@ class MRPC(Task):
def validation_docs(self): def validation_docs(self):
return self.dataset["validation"] return self.dataset["validation"]
def doc_to_text(self, doc):
return "Sentence 1: {}\nSentence 2: {}\nQuestion: Do both sentences mean the same thing?\nAnswer:".format(
general_detokenize(doc["sentence1"]),
general_detokenize(doc["sentence2"]),
)
def doc_to_target(self, doc):
return " {}".format(yesno(doc["label"]))
def construct_requests(self, doc, ctx):
ll_yes, _ = rf.loglikelihood(ctx, " yes")
ll_no, _ = rf.loglikelihood(ctx, " no")
return ll_yes, ll_no
def process_results(self, doc, results):
ll_yes, ll_no = results
gold = doc["label"]
pred = ll_yes > ll_no
return {
"acc": pred == gold,
"f1": (gold, pred),
}
def higher_is_better(self): class QQP(PromptSourceTask):
return {
"acc": True,
"f1": True
}
def aggregation(self):
return {
"acc": mean,
"f1": f1_score
}
class QQP(Task):
VERSION = 0 VERSION = 0
DATASET_PATH = "glue" DATASET_PATH = "glue"
DATASET_NAME = "qqp" DATASET_NAME = "qqp"
...@@ -483,41 +267,6 @@ class QQP(Task): ...@@ -483,41 +267,6 @@ class QQP(Task):
def validation_docs(self): def validation_docs(self):
return self.dataset["validation"] return self.dataset["validation"]
def doc_to_text(self, doc):
return "Question 1: {}\nQuestion 2: {}\nQuestion: Do both questions ask the same thing?\nAnswer:".format(
doc["question1"],
doc["question2"],
)
def doc_to_target(self, doc):
return " {}".format(yesno(doc["label"]))
def construct_requests(self, doc, ctx):
ll_yes, _ = rf.loglikelihood(ctx, " yes")
ll_no, _ = rf.loglikelihood(ctx, " no")
return ll_yes, ll_no
def process_results(self, doc, results):
ll_yes, ll_no = results
gold = doc["label"]
pred = ll_yes > ll_no
return {
"acc": pred == gold,
"f1": (gold, pred),
}
def higher_is_better(self):
return {
"acc": True,
"f1": True
}
def aggregation(self):
return {
"acc": mean,
"f1": f1_score
}
class STSB(Task):
VERSION = 0
@@ -554,7 +303,7 @@ class STSB(Task):
return " {}".format(doc["label"])
def construct_requests(self, doc, ctx):
"""Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
@@ -565,7 +314,7 @@ class STSB(Task):
part of the document for `doc`.
"""
# TODO: implement evaluation.
raise NotImplementedError("Evaluation not implemented")
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
@@ -578,7 +327,7 @@ class STSB(Task):
The results of the requests created in construct_requests.
"""
# TODO: implement evaluation.
raise NotImplementedError("Evaluation not implemented")
def aggregation(self):
"""
@@ -587,7 +336,7 @@ class STSB(Task):
functions that aggregate a list of metrics
"""
# TODO: implement evaluation.
raise NotImplementedError("Evaluation not implemented")
def higher_is_better(self):
"""
@@ -596,4 +345,4 @@ class STSB(Task):
whether a higher value of the submetric is better
"""
# TODO: implement evaluation.
raise NotImplementedError("Evaluation not implemented")
"""
Right for the Wrong Reasons: Diagnosing Syntactic Heuristics in Natural Language Inference
https://arxiv.org/abs/1902.01007
HANS (Heuristic Analysis for NLI Systems) is a controlled evaluation set
containing many examples where the syntactic heuristics studied in the paper fail.
Homepage: https://github.com/tommccoy1/hans
"""
from lm_eval.base import PromptSourceTask
_CITATION = """
@inproceedings{mccoy-etal-2019-right,
title = "Right for the Wrong Reasons: Diagnosing Syntactic Heuristics in Natural Language Inference",
author = "McCoy, Tom and
Pavlick, Ellie and
Linzen, Tal",
booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics",
month = jul,
year = "2019",
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/P19-1334",
doi = "10.18653/v1/P19-1334",
pages = "3428--3448",
abstract = "A machine learning system can score well on a given test set by relying on heuristics that are effective for frequent example types but break down in more challenging cases. We study this issue within natural language inference (NLI), the task of determining whether one sentence entails another. We hypothesize that statistical NLI models may adopt three fallible syntactic heuristics: the lexical overlap heuristic, the subsequence heuristic, and the constituent heuristic. To determine whether models have adopted these heuristics, we introduce a controlled evaluation set called HANS (Heuristic Analysis for NLI Systems), which contains many examples where the heuristics fail. We find that models trained on MNLI, including BERT, a state-of-the-art model, perform very poorly on HANS, suggesting that they have indeed adopted these heuristics. We conclude that there is substantial room for improvement in NLI systems, and that the HANS dataset can motivate and measure progress in this area.",
}
"""
class HANS(PromptSourceTask):
VERSION = 0
DATASET_PATH = "hans"
DATASET_NAME = None
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
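# The HANS dataset provides train and validation splits only; no test split is exposed here.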
return False
def training_docs(self):
if self.has_training_docs():
# We cache training documents in `self._training_docs` for faster
# few-shot processing. If the data is too large to fit in memory,
# return the training data as a generator instead of a list.
if self._training_docs is None:
self._training_docs = list(self.dataset["train"])
return self._training_docs
def validation_docs(self):
if self.has_validation_docs():
return self.dataset["validation"]
def test_docs(self):
if self.has_test_docs():
return self.dataset["test"]
"""
Language Models as Knowledge Bases? (https://arxiv.org/abs/1909.01066)
How Context Affects Language Models' Factual Predictions (https://arxiv.org/abs/2005.04611)
LAMA is a probe dataset for testing the factual and commonsense knowledge of language models. The dataset includes subsets of
Google-RE (https://code.google.com/archive/p/relation-extraction-corpus/), T-REx (a subset of Wikidata triples),
ConceptNet (https://github.com/commonsense/conceptnet5/wiki) and SQuAD.
Homepage: https://github.com/facebookresearch/LAMA
"""
from lm_eval.base import PromptSourceTask
import numpy as np
from lm_eval.metrics import mean
from typing import Optional
_CITATION = """
@inproceedings{petroni2019language, title={Language Models as Knowledge Bases?},
author={F. Petroni and T. Rockt{\"a}schel and A. H. Miller and P. Lewis and A. Bakhtin and Y. Wu and S. Riedel},
booktitle={Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing (EMNLP)}, year={2019} }
@inproceedings{petroni2020how,
title={How Context Affects Language Models' Factual Predictions},
author={Fabio Petroni and Patrick Lewis and Aleksandra Piktus and Tim Rockt{\"a}schel and Yuxiang Wu and Alexander H. Miller and Sebastian Riedel},
booktitle={Automated Knowledge Base Construction}, year={2020}, url={https://openreview.net/forum?id=025X0zPfn} }
"""
class BigScienceLAMA(PromptSourceTask):
VERSION = 0
DATASET_PATH = "janck/bigscience-lama"
DATASET_NAME = None
def has_training_docs(self):
return False
def has_validation_docs(self):
return False
def has_test_docs(self):
return True
def training_docs(self):
if self.has_training_docs():
return self.dataset["train"]
class Trex(PromptSourceTask):
VERSION = 0
DATASET_PATH = "lama"
DATASET_NAME = "trex"
def has_training_docs(self):
return False
def has_validation_docs(self):
return False
def has_test_docs(self):
return True
def training_docs(self):
if self.has_training_docs():
if self._training_docs is None:
self._training_docs = list(self.dataset["train"])
return self._training_docs
def validation_docs(self):
if self.has_validation_docs():
return self.dataset["validation"]
def test_docs(self):
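# LAMA is distributed as a single "train" split, which is used here as the evaluation set.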
if self.has_test_docs():
return self.dataset["train"]
def process_results(self, doc, results):
out = {}
#gold = doc
pred = results[0].strip()
target = self.doc_to_target(doc)['obj_label']
#pred = np.argmax(results)
out["acc"] = pred == target
if self.save_examples:
example = {
"pred": pred,
"target": target,
}
return out, example
return out
def higher_is_better(self):
return {"acc": True}
def aggregation(self):
return {"acc": mean}
def doc_to_target(self, doc):
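# Return the full document; process_results reads the gold "obj_label" field from it.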
return doc
class google_re(PromptSourceTask):
VERSION = 0
DATASET_PATH = "lama"
DATASET_NAME = "google_re"
def has_training_docs(self):
return False
def has_validation_docs(self):
return False
def has_test_docs(self):
return True
def training_docs(self):
if self.has_training_docs():
if self._training_docs is None:
self._training_docs = list(self.dataset["train"])
return self._training_docs
def validation_docs(self):
if self.has_validation_docs():
return self.dataset["validation"]
def test_docs(self):
if self.has_test_docs():
return self.dataset["train"]
def process_results(self, doc, results):
out = {}
pred = results[0].strip()
target = self.doc_to_target(doc)['obj_label']
out["acc"] = pred == target
if self.save_examples:
example = {
"pred": pred,
"target": target,
}
return out, example
return out
def higher_is_better(self):
return {"acc": True}
def aggregation(self):
return {"acc": mean}
def doc_to_target(self, doc):
return doc
class Conceptnet(PromptSourceTask):
VERSION = 0
DATASET_PATH = "lama"
DATASET_NAME = "conceptnet"
def has_training_docs(self):
return False
def has_validation_docs(self):
return False
def has_test_docs(self):
return True
def training_docs(self):
if self.has_training_docs():
if self._training_docs is None:
self._training_docs = list(self.dataset["train"])
return self._training_docs
def validation_docs(self):
if self.has_validation_docs():
return self.dataset["validation"]
def test_docs(self):
if self.has_test_docs():
return self.dataset["train"]
def process_results(self, doc, results):
out = {}
pred = results[0].strip()
target = self.doc_to_target(doc)['obj_label']
out["acc"] = pred == target
if self.save_examples:
example = {
"pred": pred,
"target": target,
}
return out, example
return out
def higher_is_better(self):
return {"acc": True}
def aggregation(self):
return {"acc": mean}
def doc_to_target(self, doc):
return doc
class Squad(PromptSourceTask):
VERSION = 0
DATASET_PATH = "lama"
DATASET_NAME = "squad"
def has_training_docs(self):
return False
def has_validation_docs(self):
return False
def has_test_docs(self):
return True
def training_docs(self):
if self.has_training_docs():
if self._training_docs is None:
self._training_docs = list(self.dataset["train"])
return self._training_docs
def validation_docs(self):
if self.has_validation_docs():
return self.dataset["validation"]
def test_docs(self):
if self.has_test_docs():
self._test_docs = list(self.dataset["train"])
return self._test_docs
def process_results(self, doc, results):
out = {}
pred = results[0].strip()
target = self.doc_to_target(doc)['obj_label']
#pred = np.argmax(results)
out["acc"] = pred == target
if self.save_examples:
example = {
"pred": pred,
"target": target,
}
return out, example
return out
def higher_is_better(self):
return {"acc": True}
def aggregation(self):
return {"acc": mean}
def doc_to_target(self, doc):
return doc
def max_generation_length(self) -> Optional[int]:
"""Denote where the max length of the generation if it is obvious from the task."""
return 5
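# Minimal sketch of the scoring convention shared by the process_results
# methods above (stated here as an assumption, not a harness API): the first
# generated continuation is stripped and compared by exact string match
# against the gold "obj_label" field carried in the document.
def _lama_exact_match(generation: str, doc: dict) -> float:
    return float(generation.strip() == doc["obj_label"])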
@@ -12,7 +12,7 @@ Homepage: https://www.cs.cmu.edu/~glai1/data/race/
import collections
import datasets
import numpy as np
from lm_eval.base import PromptSourceTask, rf
from lm_eval.metrics import mean
@@ -34,13 +34,13 @@ class each:
return list(map(self.f, other))
class RACE(PromptSourceTask):
VERSION = 1
DATASET_PATH = "race"
DATASET_NAME = "high"
cache = {}
letter_to_num = {"A": 0, "B": 1, "C": 2, "D": 3}
def has_training_docs(self):
return True
@@ -51,79 +51,88 @@ class RACE(Task):
def has_test_docs(self):
return True
# def _collate_data(self, set):
# if set in self.cache:
# return self.cache[set]
# # One big issue with HF's implementation of this dataset: it makes a
# # separate document for each question; meanwhile, in the GPT3 paper it
# # is shown that one document is made per passage.
# r = collections.defaultdict(list)
# for item in datasets.load_dataset(
# path=self.DATASET_PATH, name=self.DATASET_NAME
# )[set]:
# r[item["article"]].append(item)
# res = list(
# r.values()
# >> each(
# lambda x: {
# "article": x[0]["article"],
# "problems": x
# >> each(
# lambda y: {
# "question": y["question"],
# "answer": y["answer"],
# "options": y["options"],
# }
# ),
# }
# )
# )
# self.cache[set] = res
# return res
def training_docs(self):
return self.dataset["train"]
def validation_docs(self):
return self.dataset["validation"]
def test_docs(self):
return self.dataset["test"]
@classmethod
def get_answer_option(cls, problem):
answer = cls.letter_to_num[problem["answer"]]
return problem["options"][answer]
@classmethod
def last_problem(cls, doc):
return doc["problems"][-1]
# def doc_to_text(self, doc):
# text = 'Article: ' + doc['article'] + '\n\n'
# for problem in doc['problems'][:-1]:
# if problem['question'][-6:] == ' _ .':
# text += problem['question'][-5:] + self.get_answer_option(problem) + '\n'
# else:
# question = 'Question: ' + problem['question'] + '\n'
# answer = 'Answer: ' + self.get_answer_option(problem) + '\n'
# text += question + answer
# text += self.last_problem(doc)['question']
# return text
# def doc_to_target(self, doc):
# return " " + self.get_answer_option(self.last_problem(doc))
# def construct_requests(self, doc, ctx):
# """Uses RequestFactory to construct Requests and returns an iterable of
# Requests which will be sent to the LM.
# :param doc:
# The document as returned from training_docs, validation_docs, or test_docs.
# :param ctx: str
# The context string, generated by fewshot_context. This includes the natural
# language description, as well as the few shot examples, and the question
# part of the document for `doc`.
# """
# problem = self.last_problem(doc)
# ll_choices = [
# rf.loglikelihood(ctx, " " + problem["options"][i])[0] for i in range(4)
# ]
# return ll_choices
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
@@ -135,11 +144,11 @@ class RACE(Task):
:param results:
The results of the requests created in construct_requests.
"""
gold = self.letter_to_num[self.doc_to_target(doc)]
# gold = self.letter_to_num[self.last_problem(doc)["answer"]]
pred = np.argmax(results)
return {"acc": int(pred == gold)}
def aggregation(self):
"""
@@ -147,9 +156,7 @@ class RACE(Task):
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
return {"acc": mean}
def higher_is_better(self):
"""
@@ -157,6 +164,4 @@ class RACE(Task):
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
return {"acc": True}
...@@ -12,7 +12,7 @@ TODO: WSC requires free-form generation. ...@@ -12,7 +12,7 @@ TODO: WSC requires free-form generation.
import numpy as np import numpy as np
import sklearn import sklearn
import transformers.data.metrics.squad_metrics as squad_metrics import transformers.data.metrics.squad_metrics as squad_metrics
from lm_eval.base import rf, Task from lm_eval.base import rf, PromptSourceTask
from lm_eval.metrics import mean, acc_all, metric_max_over_ground_truths, yesno from lm_eval.metrics import mean, acc_all, metric_max_over_ground_truths, yesno
from lm_eval.utils import general_detokenize from lm_eval.utils import general_detokenize
...@@ -32,7 +32,7 @@ _CITATION = """ ...@@ -32,7 +32,7 @@ _CITATION = """
""" """
class BoolQ(Task): class BoolQ(PromptSourceTask):
VERSION = 1 VERSION = 1
DATASET_PATH = "super_glue" DATASET_PATH = "super_glue"
DATASET_NAME = "boolq" DATASET_NAME = "boolq"
...@@ -54,41 +54,8 @@ class BoolQ(Task): ...@@ -54,41 +54,8 @@ class BoolQ(Task):
def validation_docs(self): def validation_docs(self):
return self.dataset["validation"] return self.dataset["validation"]
def doc_to_text(self, doc):
return f"{doc['passage']}\nQuestion: {doc['question']}?\nAnswer:"
def doc_to_target(self, doc): class CommitmentBank(PromptSourceTask):
return " " + yesno(doc['label'])
def construct_requests(self, doc, ctx):
ll_yes, _ = rf.loglikelihood(ctx, ' yes')
ll_no, _ = rf.loglikelihood(ctx, ' no')
return ll_yes, ll_no
def process_results(self, doc, results):
ll_yes, ll_no = results
gold = doc["label"]
acc = 1. if (ll_yes > ll_no) == gold else 0.
return {
"acc": acc
}
def higher_is_better(self):
return {
"acc": True
}
def aggregation(self):
return {
"acc": mean
}
class CommitmentBank(Task):
VERSION = 1 VERSION = 1
DATASET_PATH = "super_glue" DATASET_PATH = "super_glue"
DATASET_NAME = "cb" DATASET_NAME = "cb"
...@@ -110,40 +77,15 @@ class CommitmentBank(Task): ...@@ -110,40 +77,15 @@ class CommitmentBank(Task):
def validation_docs(self): def validation_docs(self):
return self.dataset["validation"] return self.dataset["validation"]
def doc_to_text(self, doc):
return "{}\nQuestion: {}. True, False or Neither?\nAnswer:".format(
doc["premise"],
doc["hypothesis"],
)
def doc_to_target(self, doc):
# True = entailment
# False = contradiction
# Neither = neutral
return " {}".format({0: "True", 1: "False", 2: "Neither"}[doc["label"]])
def construct_requests(self, doc, ctx):
ll_true, _ = rf.loglikelihood(ctx, ' True')
ll_false, _ = rf.loglikelihood(ctx, ' False')
ll_neither, _ = rf.loglikelihood(ctx, ' Neither')
return ll_true, ll_false, ll_neither
def process_results(self, doc, results): def process_results(self, doc, results):
gold = doc["label"] gold = doc["label"]
pred = np.argmax(results) pred = np.argmax(results)
acc = 1. if pred == gold else 0. acc = 1.0 if pred == gold else 0.0
return { return {"acc": acc, "f1": (pred, gold)}
"acc": acc,
"f1": (pred, gold)
}
def higher_is_better(self): def higher_is_better(self):
return { return {"acc": True, "f1": True}
"acc": True,
"f1": True
}
@classmethod @classmethod
def cb_multi_fi(cls, items): def cb_multi_fi(cls, items):
...@@ -163,7 +105,7 @@ class CommitmentBank(Task): ...@@ -163,7 +105,7 @@ class CommitmentBank(Task):
} }
class Copa(Task): class Copa(PromptSourceTask):
VERSION = 0 VERSION = 0
DATASET_PATH = "super_glue" DATASET_PATH = "super_glue"
DATASET_NAME = "copa" DATASET_NAME = "copa"
...@@ -185,53 +127,25 @@ class Copa(Task): ...@@ -185,53 +127,25 @@ class Copa(Task):
def validation_docs(self): def validation_docs(self):
return self.dataset["validation"] return self.dataset["validation"]
def doc_to_text(self, doc):
# Drop the period
connector = {
"cause": "because",
"effect": "therefore",
}[doc["question"]]
return doc["premise"].strip()[:-1] + f" {connector}"
def doc_to_target(self, doc):
correct_choice = doc["choice1"] if doc["label"] == 0 else doc["choice2"]
# Connect the sentences
return " " + self.convert_choice(correct_choice)
def construct_requests(self, doc, ctx):
choice1 = " " + self.convert_choice(doc["choice1"])
choice2 = " " + self.convert_choice(doc["choice2"])
ll_choice1, _ = rf.loglikelihood(ctx, choice1)
ll_choice2, _ = rf.loglikelihood(ctx, choice2)
return ll_choice1, ll_choice2
def process_results(self, doc, results): def process_results(self, doc, results):
gold = doc["label"] gold = doc["label"]
pred = np.argmax(results) pred = np.argmax(results)
acc = 1. if pred == gold else 0. acc = 1.0 if pred == gold else 0.0
return { return {"acc": acc}
"acc": acc
}
def higher_is_better(self): def higher_is_better(self):
return { return {"acc": True}
"acc": True
}
def aggregation(self): def aggregation(self):
return { return {"acc": mean}
"acc": mean
}
@staticmethod @staticmethod
def convert_choice(choice): def convert_choice(choice):
return choice[0].lower() + choice[1:] return choice[0].lower() + choice[1:]
class MultiRC(Task): class MultiRC(PromptSourceTask):
VERSION = 1 VERSION = 1
DATASET_PATH = "super_glue" DATASET_PATH = "super_glue"
DATASET_NAME = "multirc" DATASET_NAME = "multirc"
...@@ -253,45 +167,19 @@ class MultiRC(Task): ...@@ -253,45 +167,19 @@ class MultiRC(Task):
def validation_docs(self): def validation_docs(self):
return self.dataset["validation"] return self.dataset["validation"]
def doc_to_text(self, doc):
return f"{doc['paragraph']}\nQuestion: {doc['question']}\nAnswer:"
def doc_to_target(self, doc):
return " " + self.format_answer(answer=doc["answer"], label=doc["label"])
@staticmethod
def format_answer(answer, label):
label_str = "yes" if label else "no"
return f"{answer}\nIs the answer correct? {label_str}"
def construct_requests(self, doc, ctx):
true_choice = self.format_answer(answer=doc["answer"], label=True)
false_choice = self.format_answer(answer=doc["answer"], label=False)
ll_true_choice, _ = rf.loglikelihood(ctx, f' {true_choice}')
ll_false_choice, _ = rf.loglikelihood(ctx, f' {false_choice}')
return ll_true_choice, ll_false_choice
def process_results(self, doc, results): def process_results(self, doc, results):
ll_true_choice, ll_false_choice = results ll_true_choice, ll_false_choice = results
pred = ll_true_choice > ll_false_choice pred = ll_true_choice > ll_false_choice
return { return {"acc": (pred, doc)}
"acc": (pred, doc)
}
def higher_is_better(self): def higher_is_better(self):
return { return {"acc": True}
"acc": True
}
def aggregation(self): def aggregation(self):
return { return {"acc": acc_all}
"acc": acc_all
}
class ReCoRD(Task): class ReCoRD(PromptSourceTask):
VERSION = 0 VERSION = 0
DATASET_PATH = "super_glue" DATASET_PATH = "super_glue"
DATASET_NAME = "record" DATASET_NAME = "record"
...@@ -311,56 +199,31 @@ class ReCoRD(Task): ...@@ -311,56 +199,31 @@ class ReCoRD(Task):
if self._training_docs is None: if self._training_docs is None:
self._training_docs = [] self._training_docs = []
for doc in self.dataset["train"]: for doc in self.dataset["train"]:
self._training_docs.append(self._process_doc(doc)) self._training_docs.append(doc)
return self._training_docs return self._training_docs
def validation_docs(self): def validation_docs(self):
# See: training_docs # See: training_docs
for doc in self.dataset["validation"]: for doc in self.dataset["validation"]:
yield self._process_doc(doc) yield doc
@classmethod
def _process_doc(cls, doc):
return {
"passage": doc["passage"],
"query": doc["query"],
"entities": sorted(list(set(doc["entities"]))),
"answers": sorted(list(set(doc["answers"]))),
}
def doc_to_text(self, doc):
initial_text, *highlights = doc["passage"].strip().split("\n@highlight\n")
text = initial_text + "\n\n"
for highlight in highlights:
text += f" - {highlight}.\n"
return text
@classmethod
def format_answer(cls, query, entity):
return f' - {query}'.replace("@placeholder", entity)
def doc_to_target(self, doc):
# We only output the first correct entity in a doc
return self.format_answer(query=doc["query"], entity=doc["answers"][0])
def construct_requests(self, doc, ctx):
requests = [
rf.loglikelihood(ctx, self.format_answer(query=doc["query"], entity=entity))
for entity in doc["entities"]
]
return requests
def process_results(self, doc, results): def process_results(self, doc, results):
# ReCoRD's evaluation is actually deceptively simple: # ReCoRD's evaluation is actually deceptively simple:
# - Pick the maximum likelihood prediction entity # - Pick the maximum likelihood prediction entity
# - Evaluate the accuracy and token F1 PER EXAMPLE # - Evaluate the accuracy and token F1 PER EXAMPLE
# - Average over all examples # - Average over all examples
# TODO (jon-tow): Look at result
max_idx = np.argmax(np.array([result[0] for result in results])) max_idx = np.argmax(np.array([result[0] for result in results]))
prediction = doc["entities"][max_idx] prediction = doc["entities"][max_idx]
gold_label_set = doc["answers"] gold_label_set = doc["answers"]
f1 = metric_max_over_ground_truths(squad_metrics.compute_f1, prediction, gold_label_set) f1 = metric_max_over_ground_truths(
em = metric_max_over_ground_truths(squad_metrics.compute_exact, prediction, gold_label_set) squad_metrics.compute_f1, prediction, gold_label_set
)
em = metric_max_over_ground_truths(
squad_metrics.compute_exact, prediction, gold_label_set
)
return { return {
"f1": f1, "f1": f1,
...@@ -380,7 +243,7 @@ class ReCoRD(Task): ...@@ -380,7 +243,7 @@ class ReCoRD(Task):
} }
class WordsInContext(Task): class WordsInContext(PromptSourceTask):
VERSION = 0 VERSION = 0
DATASET_PATH = "super_glue" DATASET_PATH = "super_glue"
DATASET_NAME = "wic" DATASET_NAME = "wic"
...@@ -402,50 +265,19 @@ class WordsInContext(Task): ...@@ -402,50 +265,19 @@ class WordsInContext(Task):
def validation_docs(self): def validation_docs(self):
return self.dataset["validation"] return self.dataset["validation"]
def doc_to_text(self, doc):
return "Sentence 1: {}\nSentence 2: {}\nQuestion: Is the word '{}' used in the same way in the" \
" two sentences above?\nAnswer:".format(
doc["sentence1"],
doc["sentence2"],
doc["sentence1"][doc["start1"]:doc["end1"]],
)
def doc_to_target(self, doc):
return " {}".format({0: "no", 1: "yes"}[doc["label"]])
def construct_requests(self, doc, ctx):
ll_yes, _ = rf.loglikelihood(ctx, ' yes')
ll_no, _ = rf.loglikelihood(ctx, ' no')
return ll_yes, ll_no
def process_results(self, doc, results):
ll_yes, ll_no = results
gold = doc["label"]
acc = 1. if (ll_yes > ll_no) == gold else 0.
return {
"acc": acc
}
def higher_is_better(self): def higher_is_better(self):
return { return {"acc": True}
"acc": True
}
def aggregation(self): def aggregation(self):
return { return {"acc": mean}
"acc": mean
}
class SGWinogradSchemaChallenge(Task): class SGWinogradSchemaChallenge(PromptSourceTask):
VERSION = 0 VERSION = 0
# Note: This implementation differs from Fig G.32 because this is the SuperGLUE, # Note: This implementation differs from Fig G.32 because this is the SuperGLUE,
# binary version of the task. # binary version of the task.
DATASET_PATH = "super_glue" DATASET_PATH = "super_glue"
DATASET_NAME = "wsc" DATASET_NAME = "wsc.fixed"
def has_training_docs(self): def has_training_docs(self):
return True return True
...@@ -461,56 +293,51 @@ class SGWinogradSchemaChallenge(Task): ...@@ -461,56 +293,51 @@ class SGWinogradSchemaChallenge(Task):
if self._training_docs is None: if self._training_docs is None:
# GPT-3 Paper's format only uses positive examples for fewshot "training" # GPT-3 Paper's format only uses positive examples for fewshot "training"
self._training_docs = [ self._training_docs = [
doc for doc in doc for doc in self.dataset["train"] if doc["label"]
self.dataset["train"]
if doc["label"]
] ]
return self._training_docs return self._training_docs
def validation_docs(self): def validation_docs(self):
return self.dataset["validation"] return self.dataset["validation"]
def doc_to_text(self, doc): def higher_is_better(self):
raw_passage = doc["text"] return {"acc": True}
# NOTE: HuggingFace span indices are word-based not character-based.
pre = " ".join(raw_passage.split()[:doc["span2_index"]])
post = raw_passage[len(pre) + len(doc["span2_text"]) + 1:]
passage = general_detokenize(pre + " *{}*".format(doc['span2_text']) + post)
noun = doc["span1_text"]
pronoun = doc["span2_text"]
text = (
f"Passage: {passage}\n"
+ f"Question: In the passage above, does the pronoun \"*{pronoun}*\" refer to \"*{noun}*\"?\n"
+ "Answer:"
)
return text
def doc_to_target(self, doc): def aggregation(self):
return " " + yesno(doc['label']) return {"acc": mean}
def construct_requests(self, doc, ctx):
ll_yes, _ = rf.loglikelihood(ctx, ' yes') class WinogenderSchemaDiagnostics(PromptSourceTask):
ll_no, _ = rf.loglikelihood(ctx, ' no') VERSION = 0
DATASET_PATH = "super_glue"
DATASET_NAME = "axg"
return ll_yes, ll_no def has_training_docs(self):
return False
def process_results(self, doc, results): def has_validation_docs(self):
ll_yes, ll_no = results return False
gold = doc["label"]
acc = 1. if (ll_yes > ll_no) == gold else 0. def has_test_docs(self):
return True
return { def test_docs(self):
"acc": acc return self.dataset["test"]
}
def higher_is_better(self):
return {
"acc": True
}
def aggregation(self): class BroadcoverageDiagnostics(PromptSourceTask):
return { VERSION = 0
"acc": mean DATASET_PATH = "super_glue"
} DATASET_NAME = "axb"
def has_training_docs(self):
return False
def has_validation_docs(self):
return False
def has_test_docs(self):
return True
def test_docs(self):
return self.dataset["test"]
"""
Gender Bias in Coreference Resolution: Evaluation and Debiasing Methods
https://arxiv.org/abs/1804.06876
Winograd-schema evaluation of gendered coreference resolution.
The dataset contains pro-stereotypical and anti-stereotypical subsets; the difference in accuracy
between the two subsets quantifies gender bias.
Homepage: https://uclanlp.github.io/corefBias/overview
"""
from lm_eval.base import PromptSourceTask
from lm_eval.metrics import mean
import transformers.data.metrics.squad_metrics as squad_metrics
_CITATION = """
@inproceedings{zhao-etal-2018-gender,
title = "Gender Bias in Coreference Resolution: Evaluation and Debiasing Methods",
author = "Zhao, Jieyu and
Wang, Tianlu and
Yatskar, Mark and
Ordonez, Vicente and
Chang, Kai-Wei",
booktitle = "Proceedings of the 2018 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 2 (Short Papers)",
month = jun,
year = "2018",
address = "New Orleans, Louisiana",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/N18-2003",
doi = "10.18653/v1/N18-2003",
pages = "15--20",
abstract = "In this paper, we introduce a new benchmark for co-reference resolution focused on gender bias, WinoBias. Our corpus contains Winograd-schema style sentences with entities corresponding to people referred by their occupation (e.g. the nurse, the doctor, the carpenter). We demonstrate that a rule-based, a feature-rich, and a neural coreference system all link gendered pronouns to pro-stereotypical entities with higher accuracy than anti-stereotypical entities, by an average difference of 21.1 in F1 score. Finally, we demonstrate a data-augmentation approach that, in combination with existing word-embedding debiasing techniques, removes the bias demonstrated by these systems in WinoBias without significantly affecting their performance on existing datasets.",
}
"""
class WinoBias(PromptSourceTask):
VERSION = 0
DATASET_PATH = "wino_bias"
def has_training_docs(self):
return False
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def training_docs(self):
pass
def validation_docs(self):
return self.dataset["validation"]
def test_docs(self):
return self.dataset["test"]
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param results:
The results of the requests created in construct_requests.
"""
target = self.doc_to_target(doc).strip()
pred = " ".join(results[0].strip().split(" ")[:len(target.split(" "))])
# The original paper uses F1. In the case of exactly one predicted and one gold mention,
# F1 and exact match are equivalent.
em = squad_metrics.compute_exact(target, pred)
out = {"em": em}
if self.save_examples:
example = {"target": target, "pred": pred}
return out, example
return out
def aggregation(self):
"""
:returns: {str: [metric_score] -> float}
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metric scores
"""
return {'em': mean}
def higher_is_better(self):
return {'em': True}
class WinoBiasType1Pro(WinoBias):
DATASET_NAME = "type1_pro"
class WinoBiasType1Anti(WinoBias):
DATASET_NAME = "type1_anti"
class WinoBiasType2Pro(WinoBias):
DATASET_NAME = "type2_pro"
class WinoBiasType2Anti(WinoBias):
DATASET_NAME = "type2_anti"
@@ -146,6 +146,19 @@ class Reorderer:
return res
def flatten(d, parent_key='', sep='_'):
# From: https://stackoverflow.com/a/6027615
items = []
for k, v in d.items():
new_key = parent_key + sep + k if parent_key else k
if isinstance(v, collections.MutableMapping):
items.extend(flatten(v, new_key, sep=sep).items())
else:
items.append((new_key, v))
return dict(items)
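# Example of the flattening behaviour (illustrative input, not from the source):
#   flatten({"a": {"b": 1}, "c": 2}) -> {"a_b": 1, "c": 2}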
def positional_deprecated(fn):
"""
A decorator to nudge users into passing only keyword args (`kwargs`) to the
@@ -9,18 +9,18 @@ logging.getLogger("openai").setLevel(logging.WARNING)
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--model", required=True)
parser.add_argument("--model_args", default="")
parser.add_argument("--tasks", default="all_tasks")
parser.add_argument("--provide_description", action="store_true")
parser.add_argument("--num_fewshot", type=int, default=0)
parser.add_argument("--batch_size", type=int, default=None)
parser.add_argument("--device", type=str, default=None)
parser.add_argument("--output_path", default=None)
parser.add_argument("--limit", type=int, default=None)
parser.add_argument("--no_cache", action="store_true")
parser.add_argument("--description_dict_path", default=None)
parser.add_argument("--check_integrity", action="store_true")
return parser.parse_args()
@@ -29,7 +29,9 @@ def main():
assert not args.provide_description # not implemented
if args.limit:
print(
"WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
)
if args.tasks == "all_tasks":
task_names = tasks.ALL_TASKS
@@ -38,7 +40,7 @@ def main():
description_dict = {}
if args.description_dict_path:
with open(args.description_dict_path, "r") as f:
description_dict = json.load(f)
results = evaluator.simple_evaluate(
@@ -51,9 +53,10 @@ def main():
no_cache=args.no_cache,
limit=args.limit,
description_dict=description_dict,
check_integrity=args.check_integrity,
)
print(results)
dumped = json.dumps(results, indent=2)
print(dumped)
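# Example invocation (model and task names are illustrative, not prescribed by this script):
#   python main.py --model gpt2 --tasks rte --num_fewshot 0 --limit 10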