Commit 18c0fa29 authored by cardy20's avatar cardy20
Browse files

conflict solved

parents 09915adf 0542d35d
""" """
Can a Suit of Armor Conduct Electricity? A New Dataset for Open Book Question Answering Can a Suit of Armor Conduct Electricity? A New Dataset for Open Book Question Answering
https://arxiv.org/pdf/1809.02789.pdf https://arxiv.org/pdf/1809.02789.pdf
OpenBookQA is a question-answering dataset modeled after open book exams for OpenBookQA is a question-answering dataset modeled after open book exams for
assessing human understanding of a subject. It consists of 5,957 multiple-choice assessing human understanding of a subject. It consists of 5,957 multiple-choice
elementary-level science questions (4,957 train, 500 dev, 500 test), which probe elementary-level science questions (4,957 train, 500 dev, 500 test), which probe
the understanding of a small “book” of 1,326 core science facts and the application the understanding of a small “book” of 1,326 core science facts and the application
of these facts to novel situations. For training, the dataset includes a mapping of these facts to novel situations. For training, the dataset includes a mapping
from each question to the core science fact it was designed to probe. Answering from each question to the core science fact it was designed to probe. Answering
OpenBookQA questions requires additional broad common knowledge, not contained OpenBookQA questions requires additional broad common knowledge, not contained
in the book. The questions, by design, are answered incorrectly by both a retrieval- in the book. The questions, by design, are answered incorrectly by both a retrieval-
based algorithm and a word co-occurrence algorithm. based algorithm and a word co-occurrence algorithm.
Homepage: https://allenai.org/data/open-book-qa Homepage: https://allenai.org/data/open-book-qa
""" """
from lm_eval.base import MultipleChoiceTask from lm_eval.base import MultipleChoiceTask
_CITATION = """ _CITATION = """
@inproceedings{OpenBookQA2018, @inproceedings{OpenBookQA2018,
title={Can a Suit of Armor Conduct Electricity? A New Dataset for Open Book Question Answering}, title={Can a Suit of Armor Conduct Electricity? A New Dataset for Open Book Question Answering},
author={Todor Mihaylov and Peter Clark and Tushar Khot and Ashish Sabharwal}, author={Todor Mihaylov and Peter Clark and Tushar Khot and Ashish Sabharwal},
booktitle={EMNLP}, booktitle={EMNLP},
year={2018} year={2018}
} }
""" """
class OpenBookQA(MultipleChoiceTask): class OpenBookQA(MultipleChoiceTask):
VERSION = 0 VERSION = 0
DATASET_PATH = "openbookqa" DATASET_PATH = "openbookqa"
DATASET_NAME = "main" DATASET_NAME = "main"
def has_training_docs(self): def has_training_docs(self):
return True return True
def has_validation_docs(self): def has_validation_docs(self):
return True return True
def has_test_docs(self): def has_test_docs(self):
return True return True
def training_docs(self): def training_docs(self):
if self._training_docs is None: if self._training_docs is None:
self._training_docs = list(map(self._process_doc, self.dataset["train"])) self._training_docs = list(map(self._process_doc, self.dataset["train"]))
return self._training_docs return self._training_docs
def validation_docs(self): def validation_docs(self):
return map(self._process_doc, self.dataset["validation"]) return map(self._process_doc, self.dataset["validation"])
def test_docs(self): def test_docs(self):
return map(self._process_doc, self.dataset["test"]) return map(self._process_doc, self.dataset["test"])
def _process_doc(self, doc): def _process_doc(self, doc):
out_doc = { out_doc = {
"id": doc["id"], "id": doc["id"],
"query": doc["question_stem"], "query": doc["question_stem"],
"choices": doc["choices"]["text"], "choices": doc["choices"]["text"],
"gold": ["A", "B", "C", "D"].index(doc["answerKey"].strip()), "gold": ["A", "B", "C", "D"].index(doc["answerKey"].strip()),
} }
return out_doc return out_doc
def doc_to_text(self, doc): def doc_to_text(self, doc):
return doc["query"] return doc["query"]
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return doc["query"]
...@@ -5,7 +5,7 @@ https://arxiv.org/pdf/1911.11641.pdf ...@@ -5,7 +5,7 @@ https://arxiv.org/pdf/1911.11641.pdf
Physical Interaction: Question Answering (PIQA) is a physical commonsense Physical Interaction: Question Answering (PIQA) is a physical commonsense
reasoning and a corresponding benchmark dataset. PIQA was designed to investigate reasoning and a corresponding benchmark dataset. PIQA was designed to investigate
the physical knowledge of existing models. To what extent are current approaches the physical knowledge of existing models. To what extent are current approaches
actually learning about the world? actually learning about the world?
Homepage: https://yonatanbisk.com/piqa/ Homepage: https://yonatanbisk.com/piqa/
""" """
...@@ -58,3 +58,9 @@ class PiQA(MultipleChoiceTask): ...@@ -58,3 +58,9 @@ class PiQA(MultipleChoiceTask):
def doc_to_text(self, doc): def doc_to_text(self, doc):
return "Question: " + doc["goal"] + "\nAnswer:" return "Question: " + doc["goal"] + "\nAnswer:"
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return doc["goal"]
...@@ -52,22 +52,29 @@ class PROST(MultipleChoiceTask): ...@@ -52,22 +52,29 @@ class PROST(MultipleChoiceTask):
def test_docs(self): def test_docs(self):
return map(self._process_doc, self.dataset["test"]) return map(self._process_doc, self.dataset["test"])
def fewshot_context(self, doc, num_fewshot, provide_description=None, rnd=None, description=None): def fewshot_context(
assert num_fewshot == 0, 'PROST is designed to probe models in a zero-shot fashion only.' self, doc, num_fewshot, provide_description=None, rnd=None, description=None
):
assert (
num_fewshot == 0
), "PROST is designed to probe models in a zero-shot fashion only."
return super().fewshot_context( return super().fewshot_context(
doc=doc, doc=doc, num_fewshot=num_fewshot, rnd=rnd, description=description
num_fewshot=num_fewshot,
rnd=rnd,
description=description
) )
def _process_doc(self, doc): def _process_doc(self, doc):
out_doc = { out_doc = {
"query": f"{doc['context']}\nQuestion: {doc['ex_question']}\nAnswer:", "query": f"{doc['context']}\nQuestion: {doc['ex_question']}\nAnswer:",
"choices": [doc['A'], doc['B'], doc['C'], doc['D']], "choices": [doc["A"], doc["B"], doc["C"], doc["D"]],
"gold": doc['label'], "gold": doc["label"],
} }
return out_doc return out_doc
def doc_to_text(self, doc): def doc_to_text(self, doc):
return doc["query"] return doc["query"]
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return doc["query"]
...@@ -3,14 +3,14 @@ PubMedQA: A Dataset for Biomedical Research Question Answering ...@@ -3,14 +3,14 @@ PubMedQA: A Dataset for Biomedical Research Question Answering
https://arxiv.org/pdf/1909.06146.pdf https://arxiv.org/pdf/1909.06146.pdf
PubMedQA is a novel biomedical question answering (QA) dataset collected from PubMedQA is a novel biomedical question answering (QA) dataset collected from
PubMed abstracts. The task of PubMedQA is to answer research questions with PubMed abstracts. The task of PubMedQA is to answer research questions with
yes/no/maybe (e.g.: Do preoperative statins reduce atrial fibrillation after yes/no/maybe (e.g.: Do preoperative statins reduce atrial fibrillation after
coronary artery bypass grafting?) using the corresponding abstracts. PubMedQA coronary artery bypass grafting?) using the corresponding abstracts. PubMedQA
has 1k expert-annotated, 61.2k unlabeled and 211.3k artificially generated QA has 1k expert-annotated, 61.2k unlabeled and 211.3k artificially generated QA
instances. Each PubMedQA instance is composed of (1) a question which is either instances. Each PubMedQA instance is composed of (1) a question which is either
an existing research article title or derived from one, (2) a context which is an existing research article title or derived from one, (2) a context which is
the corresponding abstract without its conclusion, (3) a long answer, which is the corresponding abstract without its conclusion, (3) a long answer, which is
the conclusion of the abstract and, presumably, answers the research question, the conclusion of the abstract and, presumably, answers the research question,
and (4) a yes/no/maybe answer which summarizes the conclusion. and (4) a yes/no/maybe answer which summarizes the conclusion.
Homepage: https://pubmedqa.github.io/ Homepage: https://pubmedqa.github.io/
...@@ -53,16 +53,20 @@ class Pubmed_QA(Task): ...@@ -53,16 +53,20 @@ class Pubmed_QA(Task):
def doc_to_text(self, doc): def doc_to_text(self, doc):
ctxs = "\n".join(doc["context"]["contexts"]) ctxs = "\n".join(doc["context"]["contexts"])
return "Abstract: {}\nQuestion: {}\nAnswer:".format( return "Abstract: {}\nQuestion: {}\nAnswer:".format(
ctxs, ctxs, doc["question"], doc["final_decision"]
doc["question"],
doc["final_decision"]
) )
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return doc["question"] + " " + "\n".join(doc["context"]["contexts"])
def doc_to_target(self, doc): def doc_to_target(self, doc):
return " {}".format(doc["final_decision"]) return " {}".format(doc["final_decision"])
def construct_requests(self, doc, ctx): def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns """Uses RequestFactory to construct Requests and returns
an iterable of Requests which will be sent to the LM. an iterable of Requests which will be sent to the LM.
""" """
ll_yes, _ = rf.loglikelihood(ctx, " yes") ll_yes, _ = rf.loglikelihood(ctx, " yes")
...@@ -75,15 +79,11 @@ class Pubmed_QA(Task): ...@@ -75,15 +79,11 @@ class Pubmed_QA(Task):
ll_yes, ll_no, ll_maybe = results ll_yes, ll_no, ll_maybe = results
pred = np.argmax(results) pred = np.argmax(results)
return { return {
"acc": ["yes", "no", "maybe"][pred] == gold, "acc": ["yes", "no", "maybe"][pred] == gold,
} }
def aggregation(self): def aggregation(self):
return { return {"acc": mean}
"acc" : mean
}
def higher_is_better(self): def higher_is_better(self):
return { return {"acc": True}
"acc" : True
}
...@@ -3,9 +3,9 @@ QA4MRE 2011-2013: Overview of Question Answering for Machine Reading Evaluation ...@@ -3,9 +3,9 @@ QA4MRE 2011-2013: Overview of Question Answering for Machine Reading Evaluation
https://www.cs.cmu.edu/~./hovy/papers/13CLEF-QA4MRE.pdf https://www.cs.cmu.edu/~./hovy/papers/13CLEF-QA4MRE.pdf
The (English only) QA4MRE challenge which was run as a Lab at CLEF 2011-2013. The (English only) QA4MRE challenge which was run as a Lab at CLEF 2011-2013.
The main objective of this exercise is to develop a methodology for evaluating The main objective of this exercise is to develop a methodology for evaluating
Machine Reading systems through Question Answering and Reading Comprehension Machine Reading systems through Question Answering and Reading Comprehension
Tests. Systems should be able to extract knowledge from large volumes of text Tests. Systems should be able to extract knowledge from large volumes of text
and use this knowledge to answer questions. Four different tasks have been and use this knowledge to answer questions. Four different tasks have been
organized during these years: Main Task, Processing Modality and Negation for organized during these years: Main Task, Processing Modality and Negation for
Machine Reading, Machine Reading of Biomedical Texts about Alzheimer's disease, Machine Reading, Machine Reading of Biomedical Texts about Alzheimer's disease,
...@@ -23,7 +23,7 @@ _CITATION = """ ...@@ -23,7 +23,7 @@ _CITATION = """
booktitle={CLEF}, booktitle={CLEF},
year={2013} year={2013}
} }
""" """ # noqa: W605
class QA4MRE(MultipleChoiceTask): class QA4MRE(MultipleChoiceTask):
...@@ -47,7 +47,7 @@ class QA4MRE(MultipleChoiceTask): ...@@ -47,7 +47,7 @@ class QA4MRE(MultipleChoiceTask):
def _process_doc(self, doc): def _process_doc(self, doc):
choices = doc["answer_options"]["answer_str"] choices = doc["answer_options"]["answer_str"]
out_doc = { out_doc = {
"source": doc["document_str"].strip().replace("\'", "'"), "source": doc["document_str"].strip().replace("'", "'"),
"query": doc["question_str"], "query": doc["question_str"],
"choices": choices, "choices": choices,
"gold": int(doc["correct_answer_id"]) - 1, "gold": int(doc["correct_answer_id"]) - 1,
...@@ -57,6 +57,12 @@ class QA4MRE(MultipleChoiceTask): ...@@ -57,6 +57,12 @@ class QA4MRE(MultipleChoiceTask):
def doc_to_text(self, doc): def doc_to_text(self, doc):
return "{}\nQuestion: {}\nAnswer:".format(doc["source"], doc["query"]) return "{}\nQuestion: {}\nAnswer:".format(doc["source"], doc["query"])
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return doc["source"] + " " + doc["query"]
class QA4MRE_2011(QA4MRE): class QA4MRE_2011(QA4MRE):
DATASET_NAME = "2011.main.EN" DATASET_NAME = "2011.main.EN"
......
""" """
A Dataset of Information-Seeking Questions and Answers Anchored in Research Papers A Dataset of Information-Seeking Questions and Answers Anchored in Research Papers
https://arxiv.org/abs/2105.03011 https://arxiv.org/abs/2105.03011
...@@ -214,7 +214,7 @@ class QASPER(Task): ...@@ -214,7 +214,7 @@ class QASPER(Task):
""" """
# unanswerable = rf.loglikelihood(ctx, " " + "unanswerable") # unanswerable = rf.loglikelihood(ctx, " " + "unanswerable")
if doc["answer_type"] in ("free form answer"): if doc["answer_type"] in ("free form answer"):
return [rf.greedy_until(ctx, ["\n"])] return [rf.greedy_until(ctx, {'until': ["\n"]})]
elif doc["answer_type"] in ("bool"): elif doc["answer_type"] in ("bool"):
ll_yes, _ = rf.loglikelihood(ctx, " yes") ll_yes, _ = rf.loglikelihood(ctx, " yes")
ll_no, _ = rf.loglikelihood(ctx, " no") ll_no, _ = rf.loglikelihood(ctx, " no")
......
""" """
QuAC: Question Answering in Context QuAC: Question Answering in Context
https://arxiv.org/abs/1808.07036 https://arxiv.org/abs/1808.07036
Question Answering in Context (QuAC) is a dataset for modeling, understanding, and Question Answering in Context (QuAC) is a dataset for modeling, understanding, and
participating in information seeking dialog. Data instances consist of an interactive participating in information seeking dialog. Data instances consist of an interactive
dialog between two crowd workers: (1) a student who poses a sequence of freeform dialog between two crowd workers: (1) a student who poses a sequence of freeform
questions to learn as much as possible about a hidden Wikipedia text, and (2) questions to learn as much as possible about a hidden Wikipedia text, and (2)
a teacher who answers the questions by providing short excerpts (spans) from the text. a teacher who answers the questions by providing short excerpts (spans) from the text.
Homepage: https://quac.ai/ Homepage: https://quac.ai/
""" """
import inspect import inspect
import lm_eval.datasets.quac.quac import lm_eval.datasets.quac.quac
from lm_eval.base import Task from lm_eval.base import Task
_CITATION = """ _CITATION = """
@article{choi2018quac, @article{choi2018quac,
title={Quac: Question answering in context}, title={Quac: Question answering in context},
author={Choi, Eunsol and He, He and Iyyer, Mohit and Yatskar, Mark and Yih, Wen-tau and Choi, Yejin and Liang, Percy and Zettlemoyer, Luke}, author={Choi, Eunsol and He, He and Iyyer, Mohit and Yatskar, Mark and Yih, Wen-tau and Choi, Yejin and Liang, Percy and Zettlemoyer, Luke},
journal={arXiv preprint arXiv:1808.07036}, journal={arXiv preprint arXiv:1808.07036},
year={2018} year={2018}
} }
""" """
class QuAC(Task): class QuAC(Task):
VERSION = 0 VERSION = 0
DATASET_PATH = inspect.getfile(lm_eval.datasets.quac.quac) DATASET_PATH = inspect.getfile(lm_eval.datasets.quac.quac)
DATASET_NAME = None DATASET_NAME = None
def has_training_docs(self): def has_training_docs(self):
return True return True
def has_validation_docs(self): def has_validation_docs(self):
return True return True
def has_test_docs(self): def has_test_docs(self):
return False return False
def training_docs(self): def training_docs(self):
if self._training_docs is None: if self._training_docs is None:
self._training_docs = list(map(self._process_doc, self.dataset["train"])) self._training_docs = list(map(self._process_doc, self.dataset["train"]))
return self._training_docs return self._training_docs
def validation_docs(self): def validation_docs(self):
return map(self._process_doc, self.dataset["validation"]) return map(self._process_doc, self.dataset["validation"])
def test_docs(self): def test_docs(self):
raise NotImplementedError("QuAC has no test docs.") raise NotImplementedError("QuAC has no test docs.")
def _process_doc(self, doc): def _process_doc(self, doc):
doc["title"] = doc['title'] + ' - ' + doc['section_title'] doc["title"] = doc["title"] + " - " + doc["section_title"]
return doc return doc
def doc_to_text(self, doc): def doc_to_text(self, doc):
return 'TITLE: ' + doc['title'] + '\n' + 'PARAGRAPH: ' + doc['paragraph'] + '\n\n' + 'Q: ' + doc['question'] + '\n\n' + 'A: ' return (
"TITLE: "
def doc_to_target(self, doc): + doc["title"]
return doc['answer'] + "\n"
+ "PARAGRAPH: "
def construct_requests(self, doc, ctx): + doc["paragraph"]
""" Uses RequestFactory to construct Requests and returns an iterable of + "\n\n"
Requests which will be sent to the LM. + "Q: "
+ doc["question"]
:param doc: + "\n\n"
The document as returned from training_docs, validation_docs, or test_docs. + "A: "
:param ctx: str )
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question def should_decontaminate(self):
part of the document for `doc`. return True
"""
# TODO: implement evaluation. def doc_to_decontamination_query(self, doc):
raise NotImplementedError('Evaluation not implemented') return doc["paragraph"]
def process_results(self, doc, results): def doc_to_target(self, doc):
"""Take a single document and the LM results and evaluates, returning a return doc["answer"]
dict where keys are the names of submetrics and values are the values of
the metric for that one document def construct_requests(self, doc, ctx):
"""Uses RequestFactory to construct Requests and returns an iterable of
:param doc: Requests which will be sent to the LM.
The document as returned from training_docs, validation_docs, or test_docs.
:param results: :param doc:
The results of the requests created in construct_requests. The document as returned from training_docs, validation_docs, or test_docs.
""" :param ctx: str
# TODO: implement evaluation. The context string, generated by fewshot_context. This includes the natural
raise NotImplementedError('Evaluation not implemented') language description, as well as the few shot examples, and the question
part of the document for `doc`.
def aggregation(self): """
""" # TODO: implement evaluation.
:returns: {str: [float] -> float} raise NotImplementedError("Evaluation not implemented")
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics def process_results(self, doc, results):
""" """Take a single document and the LM results and evaluates, returning a
# TODO: implement evaluation. dict where keys are the names of submetrics and values are the values of
raise NotImplementedError('Evaluation not implemented') the metric for that one document
def higher_is_better(self): :param doc:
""" The document as returned from training_docs, validation_docs, or test_docs.
:returns: {str: bool} :param results:
A dictionary where keys are the names of submetrics and values are The results of the requests created in construct_requests.
whether a higher value of the submetric is better """
""" # TODO: implement evaluation.
# TODO: implement evaluation. raise NotImplementedError("Evaluation not implemented")
raise NotImplementedError('Evaluation not implemented')
def aggregation(self):
"""
:returns: {str: [float] -> float}
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
# TODO: implement evaluation.
raise NotImplementedError("Evaluation not implemented")
def higher_is_better(self):
"""
:returns: {str: bool}
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
# TODO: implement evaluation.
raise NotImplementedError("Evaluation not implemented")
...@@ -20,7 +20,7 @@ _CITATION = """ ...@@ -20,7 +20,7 @@ _CITATION = """
@article{lai2017large, @article{lai2017large,
title={RACE: Large-scale ReAding Comprehension Dataset From Examinations}, title={RACE: Large-scale ReAding Comprehension Dataset From Examinations},
author={Lai, Guokun and Xie, Qizhe and Liu, Hanxiao and Yang, Yiming and Hovy, Eduard}, author={Lai, Guokun and Xie, Qizhe and Liu, Hanxiao and Yang, Yiming and Hovy, Eduard},
journal={arXiv preprint arXiv:1704.04683}, journal={arXiv preprint arXiv:1704.04683},
year={2017} year={2017}
} }
""" """
...@@ -40,7 +40,7 @@ class RACE(Task): ...@@ -40,7 +40,7 @@ class RACE(Task):
DATASET_NAME = "high" DATASET_NAME = "high"
cache = {} cache = {}
letter_to_num = {'A': 0, 'B': 1, 'C': 2, 'D': 3} letter_to_num = {"A": 0, "B": 1, "C": 2, "D": 3}
def has_training_docs(self): def has_training_docs(self):
return True return True
...@@ -59,17 +59,27 @@ class RACE(Task): ...@@ -59,17 +59,27 @@ class RACE(Task):
# is shown that one document is made per passage. # is shown that one document is made per passage.
r = collections.defaultdict(list) r = collections.defaultdict(list)
for item in datasets.load_dataset(path=self.DATASET_PATH, name=self.DATASET_NAME)[set]: for item in datasets.load_dataset(
r[item['article']].append(item) path=self.DATASET_PATH, name=self.DATASET_NAME
)[set]:
res = list(r.values() >> each(lambda x: { r[item["article"]].append(item)
'article': x[0]['article'],
'problems': x >> each(lambda y: { res = list(
'question': y['question'], r.values()
'answer': y['answer'], >> each(
'options': y['options'], lambda x: {
}) "article": x[0]["article"],
})) "problems": x
>> each(
lambda y: {
"question": y["question"],
"answer": y["answer"],
"options": y["options"],
}
),
}
)
)
self.cache[set] = res self.cache[set] = res
return res return res
...@@ -85,49 +95,56 @@ class RACE(Task): ...@@ -85,49 +95,56 @@ class RACE(Task):
@classmethod @classmethod
def get_answer_option(cls, problem): def get_answer_option(cls, problem):
answer = cls.letter_to_num[problem['answer']] answer = cls.letter_to_num[problem["answer"]]
return problem['options'][answer] return problem["options"][answer]
@classmethod @classmethod
def last_problem(cls, doc): def last_problem(cls, doc):
return doc['problems'][-1] return doc["problems"][-1]
def doc_to_text(self, doc): def doc_to_text(self, doc):
text = 'Article: ' + doc['article'] + '\n\n' text = "Article: " + doc["article"] + "\n\n"
for problem in doc['problems'][:-1]: for problem in doc["problems"][:-1]:
if problem['question'][-6:] == ' _ .': if problem["question"][-6:] == " _ .":
text += problem['question'][-5:] + self.get_answer_option(problem) + '\n' text += (
problem["question"][-5:] + self.get_answer_option(problem) + "\n"
)
else: else:
question = 'Question: ' + problem['question'] + '\n' question = "Question: " + problem["question"] + "\n"
answer = 'Answer: ' + self.get_answer_option(problem) + '\n' answer = "Answer: " + self.get_answer_option(problem) + "\n"
text += question + answer text += question + answer
text += self.last_problem(doc)['question'] text += self.last_problem(doc)["question"]
return text return text
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return doc["article"]
def doc_to_target(self, doc): def doc_to_target(self, doc):
return " " + self.get_answer_option(self.last_problem(doc)) return " " + self.get_answer_option(self.last_problem(doc))
def construct_requests(self, doc, ctx): def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of """Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM. Requests which will be sent to the LM.
:param doc: :param doc:
The document as returned from training_docs, validation_docs, or test_docs. The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str :param ctx: str
The context string, generated by fewshot_context. This includes the natural The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question language description, as well as the few shot examples, and the question
part of the document for `doc`. part of the document for `doc`.
""" """
problem = self.last_problem(doc) problem = self.last_problem(doc)
ll_choices = [ ll_choices = [
rf.loglikelihood(ctx, " " + problem['options'][i])[0] rf.loglikelihood(ctx, " " + problem["options"][i])[0] for i in range(4)
for i in range(4)
] ]
return ll_choices return ll_choices
def process_results(self, doc, results): def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a """Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of dict where keys are the names of submetrics and values are the values of
the metric for that one document the metric for that one document
:param doc: :param doc:
...@@ -135,28 +152,22 @@ class RACE(Task): ...@@ -135,28 +152,22 @@ class RACE(Task):
:param results: :param results:
The results of the requests created in construct_requests. The results of the requests created in construct_requests.
""" """
gold = self.letter_to_num[self.last_problem(doc)['answer']] gold = self.letter_to_num[self.last_problem(doc)["answer"]]
pred = np.argmax(results) pred = np.argmax(results)
return { return {"acc": int(pred == gold)}
"acc": int(pred == gold)
}
def aggregation(self): def aggregation(self):
""" """
:returns: {str: [float] -> float} :returns: {str: [float] -> float}
A dictionary where keys are the names of submetrics and values are A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics functions that aggregate a list of metrics
""" """
return { return {"acc": mean}
"acc": mean
}
def higher_is_better(self): def higher_is_better(self):
""" """
:returns: {str: bool} :returns: {str: bool}
A dictionary where keys are the names of submetrics and values are A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better whether a higher value of the submetric is better
""" """
return { return {"acc": True}
"acc": True
}
...@@ -59,11 +59,19 @@ class SATAnalogies(MultipleChoiceTask): ...@@ -59,11 +59,19 @@ class SATAnalogies(MultipleChoiceTask):
def _process_doc(self, doc): def _process_doc(self, doc):
return { return {
'source': doc['source'], "source": doc["source"],
'query': doc['stem'].split(' ')[:2], "query": doc["stem"].split(" ")[:2],
'choices': ["{} is to {}".format(*c.split(' ')[:2]) for c in doc["choices"]], "choices": [
'gold': ['a', 'b', 'c', 'd', 'e'].index(doc['solution'].strip()), "{} is to {}".format(*c.split(" ")[:2]) for c in doc["choices"]
],
"gold": ["a", "b", "c", "d", "e"].index(doc["solution"].strip()),
} }
def doc_to_text(self, doc): def doc_to_text(self, doc):
return "{} is to {} as".format(*doc['query']) return "{} is to {} as".format(*doc["query"])
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return doc["source"] + "\n" + " ".join(doc["query"])
...@@ -54,10 +54,10 @@ class SciQ(MultipleChoiceTask): ...@@ -54,10 +54,10 @@ class SciQ(MultipleChoiceTask):
doc["distractor3"], doc["distractor3"],
doc["correct_answer"], doc["correct_answer"],
] ]
src = doc['support'] src = doc["support"]
out_doc = { out_doc = {
"source": src, "source": src,
"query": doc['question'], "query": doc["question"],
"choices": choices, "choices": choices,
"gold": 3, "gold": 3,
} }
...@@ -65,3 +65,9 @@ class SciQ(MultipleChoiceTask): ...@@ -65,3 +65,9 @@ class SciQ(MultipleChoiceTask):
def doc_to_text(self, doc): def doc_to_text(self, doc):
return "{}\nQuestion: {}\nAnswer:".format(doc["source"], doc["query"]).strip() return "{}\nQuestion: {}\nAnswer:".format(doc["source"], doc["query"]).strip()
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return doc["source"] + " " + doc["query"]
""" """
Know What You Don’t Know: Unanswerable Questions for SQuAD Know What You Don’t Know: Unanswerable Questions for SQuAD
https://arxiv.org/pdf/1806.03822.pdf https://arxiv.org/pdf/1806.03822.pdf
Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset, Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset,
consisting of questions posed by crowdworkers on a set of Wikipedia articles, consisting of questions posed by crowdworkers on a set of Wikipedia articles,
where the answer to every question is a segment of text, or span, from the where the answer to every question is a segment of text, or span, from the
corresponding reading passage, or the question might be unanswerable. corresponding reading passage, or the question might be unanswerable.
SQuAD2.0 combines the 100,000 questions in SQuAD1.1 with over 50,000 unanswerable SQuAD2.0 combines the 100,000 questions in SQuAD1.1 with over 50,000 unanswerable
questions written adversarially by crowdworkers to look similar to answerable ones. questions written adversarially by crowdworkers to look similar to answerable ones.
To do well on SQuAD2.0, systems must not only answer questions when possible, but To do well on SQuAD2.0, systems must not only answer questions when possible, but
also determine when no answer is supported by the paragraph and abstain from answering. also determine when no answer is supported by the paragraph and abstain from answering.
Homepage: https://rajpurkar.github.io/SQuAD-explorer/ Homepage: https://rajpurkar.github.io/SQuAD-explorer/
""" """
import datasets import datasets
from math import exp from math import exp
from lm_eval.base import rf, Task from lm_eval.base import rf, Task
from functools import partial from functools import partial
from packaging import version from packaging import version
_CITATION = """ _CITATION = """
@misc{rajpurkar2018know, @misc{rajpurkar2018know,
title={Know What You Don't Know: Unanswerable Questions for SQuAD}, title={Know What You Don't Know: Unanswerable Questions for SQuAD},
author={Pranav Rajpurkar and Robin Jia and Percy Liang}, author={Pranav Rajpurkar and Robin Jia and Percy Liang},
year={2018}, year={2018},
eprint={1806.03822}, eprint={1806.03822},
archivePrefix={arXiv}, archivePrefix={arXiv},
primaryClass={cs.CL} primaryClass={cs.CL}
} }
""" """
def _squad_metric(predictions, references): def _squad_metric(predictions, references):
squad_metric = datasets.load_metric("squad_v2") squad_metric = datasets.load_metric("squad_v2")
return squad_metric.compute(predictions=predictions, references=references) return squad_metric.compute(predictions=predictions, references=references)
def _squad_agg(key, items): def _squad_agg(key, items):
predictions, references = zip(*items) predictions, references = zip(*items)
return _squad_metric(predictions=predictions, references=references)[key] return _squad_metric(predictions=predictions, references=references).get(key, 0)
class SQuAD2(Task): class SQuAD2(Task):
VERSION = 1 VERSION = 1
DATASET_PATH = "squad_v2" DATASET_PATH = "squad_v2"
DATASET_NAME = None DATASET_NAME = None
# HF changed squad on us so we have to make sure we aren't running the old one # HF changed squad on us so we have to make sure we aren't running the old one
assert version.parse(datasets.__version__) >= version.parse("1.11.0"), "datasets v1.11.0 or later required for SQuAD" assert version.parse(datasets.__version__) >= version.parse(
"1.11.0"
def has_training_docs(self): ), "datasets v1.11.0 or later required for SQuAD"
return True
def has_training_docs(self):
def has_validation_docs(self): return True
return True
def has_validation_docs(self):
def has_test_docs(self): return True
return False
def has_test_docs(self):
def training_docs(self): return False
return self.dataset["train"]
def training_docs(self):
def validation_docs(self): return self.dataset["train"]
return self.dataset["validation"]
def validation_docs(self):
def doc_to_text(self, doc): return self.dataset["validation"]
return 'Title: ' + doc['title'] + '\n\n' + 'Background: ' + doc['context'] + '\n\n' + 'Question: ' + doc['question'] + '\n\n' + 'Answer:'
def doc_to_text(self, doc):
def doc_to_target(self, doc): return (
answer_list = doc['answers']['text'] "Title: "
if len(answer_list) > 0: + doc["title"]
answer = answer_list[0] + "\n\n"
else: + "Background: "
answer = 'unanswerable' + doc["context"]
return " " + answer + "\n\n"
+ "Question: "
def construct_requests(self, doc, ctx): + doc["question"]
""" Uses RequestFactory to construct Requests and returns an iterable of + "\n\n"
Requests which will be sent to the LM. + "Answer:"
)
:param doc:
The document as returned from training_docs, validation_docs, or test_docs. def should_decontaminate(self):
:param ctx: str return True
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question def doc_to_decontamination_query(self, doc):
part of the document for `doc`. return doc["context"]
"""
continuation = rf.greedy_until(ctx, ['\n']) def doc_to_target(self, doc):
is_unanswerable = rf.loglikelihood(ctx, " " + "unanswerable") answer_list = doc["answers"]["text"]
return continuation, is_unanswerable if len(answer_list) > 0:
answer = answer_list[0]
def process_results(self, doc, results): else:
"""Take a single document and the LM results and evaluates, returning a answer = "unanswerable"
dict where keys are the names of submetrics and values are the values of return " " + answer
the metric for that one document
def construct_requests(self, doc, ctx):
:param doc: """Uses RequestFactory to construct Requests and returns an iterable of
The document as returned from training_docs, validation_docs, or test_docs. Requests which will be sent to the LM.
:param results:
The results of the requests created in construct_requests. :param doc:
""" The document as returned from training_docs, validation_docs, or test_docs.
continuation, (logprob_unanswerable, _) = results :param ctx: str
The context string, generated by fewshot_context. This includes the natural
no_answer_probability = exp(logprob_unanswerable) language description, as well as the few shot examples, and the question
part of the document for `doc`.
predictions = { """
'id': doc['id'], continuation = rf.greedy_until(ctx, {'until': ["\n"]})
'prediction_text': continuation, is_unanswerable = rf.loglikelihood(ctx, " " + "unanswerable")
'no_answer_probability': no_answer_probability, return continuation, is_unanswerable
}
def process_results(self, doc, results):
references = { """Take a single document and the LM results and evaluates, returning a
'id': doc['id'], dict where keys are the names of submetrics and values are the values of
'answers': doc['answers'], the metric for that one document
}
:param doc:
return { The document as returned from training_docs, validation_docs, or test_docs.
'exact': (predictions, references), # Exact match (the normalized answer exactly match the gold answer) :param results:
'f1': (predictions, references), # The F-score of predicted tokens versus the gold answer The results of the requests created in construct_requests.
'HasAns_exact': (predictions, references), # Exact match (the normalized answer exactly match the gold answer) """
'HasAns_f1': (predictions, references), # The F-score of predicted tokens versus the gold answer continuation, (logprob_unanswerable, _) = results
'NoAns_exact': (predictions, references), # Exact match (the normalized answer exactly match the gold answer)
'NoAns_f1': (predictions, references), # The F-score of predicted tokens versus the gold answer no_answer_probability = exp(logprob_unanswerable)
'best_exact': (predictions, references), # Best exact match (with varying threshold)
'best_f1': (predictions, references), # Best F1 (with varying threshold) predictions = {
} "id": doc["id"],
"prediction_text": continuation,
def aggregation(self): "no_answer_probability": no_answer_probability,
""" }
:returns: {str: [float] -> float}
A dictionary where keys are the names of submetrics and values are references = {
functions that aggregate a list of metrics "id": doc["id"],
""" "answers": doc["answers"],
return { }
'exact': partial(_squad_agg, 'exact'), # Exact match (the normalized answer exactly match the gold answer)
'f1': partial(_squad_agg, 'f1'), # The F-score of predicted tokens versus the gold answer return {
'HasAns_exact': partial(_squad_agg, 'HasAns_exact'), # Exact match (the normalized answer exactly match the gold answer) "exact": (
'HasAns_f1': partial(_squad_agg, 'HasAns_f1'), # The F-score of predicted tokens versus the gold answer predictions,
'NoAns_exact': partial(_squad_agg, 'NoAns_exact'), # Exact match (the normalized answer exactly match the gold answer) references,
'NoAns_f1': partial(_squad_agg, 'NoAns_f1'), # The F-score of predicted tokens versus the gold answer ), # Exact match (the normalized answer exactly match the gold answer)
'best_exact': partial(_squad_agg, 'best_exact'), # Best exact match (with varying threshold) "f1": (
'best_f1': partial(_squad_agg, 'best_f1'), # Best F1 (with varying threshold) predictions,
} references,
), # The F-score of predicted tokens versus the gold answer
def higher_is_better(self): "HasAns_exact": (
""" predictions,
:returns: {str: bool} references,
A dictionary where keys are the names of submetrics and values are ), # Exact match (the normalized answer exactly match the gold answer)
whether a higher value of the submetric is better "HasAns_f1": (
""" predictions,
return { references,
'exact': True, # Exact match (the normalized answer exactly match the gold answer) ), # The F-score of predicted tokens versus the gold answer
'f1': True, # The F-score of predicted tokens versus the gold answer "NoAns_exact": (
'HasAns_exact': True, # Exact match (the normalized answer exactly match the gold answer) predictions,
'HasAns_f1': True, # The F-score of predicted tokens versus the gold answer references,
'NoAns_exact': True, # Exact match (the normalized answer exactly match the gold answer) ), # Exact match (the normalized answer exactly match the gold answer)
'NoAns_f1': True, # The F-score of predicted tokens versus the gold answer "NoAns_f1": (
'best_exact': True, # Best exact match (with varying threshold) predictions,
'best_f1': True, # Best F1 (with varying threshold) references,
} ), # The F-score of predicted tokens versus the gold answer
"best_exact": (
predictions,
references,
), # Best exact match (with varying threshold)
"best_f1": (predictions, references), # Best F1 (with varying threshold)
}
def aggregation(self):
"""
:returns: {str: [float] -> float}
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
return {
"exact": partial(
_squad_agg, "exact"
), # Exact match (the normalized answer exactly match the gold answer)
"f1": partial(
_squad_agg, "f1"
), # The F-score of predicted tokens versus the gold answer
"HasAns_exact": partial(
_squad_agg, "HasAns_exact"
), # Exact match (the normalized answer exactly match the gold answer)
"HasAns_f1": partial(
_squad_agg, "HasAns_f1"
), # The F-score of predicted tokens versus the gold answer
"NoAns_exact": partial(
_squad_agg, "NoAns_exact"
), # Exact match (the normalized answer exactly match the gold answer)
"NoAns_f1": partial(
_squad_agg, "NoAns_f1"
), # The F-score of predicted tokens versus the gold answer
"best_exact": partial(
_squad_agg, "best_exact"
), # Best exact match (with varying threshold)
"best_f1": partial(
_squad_agg, "best_f1"
), # Best F1 (with varying threshold)
}
def higher_is_better(self):
"""
:returns: {str: bool}
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
return {
"exact": True, # Exact match (the normalized answer exactly match the gold answer)
"f1": True, # The F-score of predicted tokens versus the gold answer
"HasAns_exact": True, # Exact match (the normalized answer exactly match the gold answer)
"HasAns_f1": True, # The F-score of predicted tokens versus the gold answer
"NoAns_exact": True, # Exact match (the normalized answer exactly match the gold answer)
"NoAns_f1": True, # The F-score of predicted tokens versus the gold answer
"best_exact": True, # Best exact match (with varying threshold)
"best_f1": True, # Best F1 (with varying threshold)
}
...@@ -65,12 +65,27 @@ class StoryCloze(Task): ...@@ -65,12 +65,27 @@ class StoryCloze(Task):
return self.dataset["test"] return self.dataset["test"]
def doc_to_text(self, doc): def doc_to_text(self, doc):
return ' '.join([ return " ".join(
doc["input_sentence_1"], [
doc["input_sentence_2"], doc["input_sentence_1"],
doc["input_sentence_3"], doc["input_sentence_2"],
doc["input_sentence_4"], doc["input_sentence_3"],
]) doc["input_sentence_4"],
]
)
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return " ".join(
[
doc["input_sentence_1"],
doc["input_sentence_2"],
doc["input_sentence_3"],
doc["input_sentence_4"],
]
)
def doc_to_target(self, doc): def doc_to_target(self, doc):
clozes = [doc["sentence_quiz1"], doc["sentence_quiz2"]] clozes = [doc["sentence_quiz1"], doc["sentence_quiz2"]]
...@@ -78,7 +93,7 @@ class StoryCloze(Task): ...@@ -78,7 +93,7 @@ class StoryCloze(Task):
return " " + clozes[doc["answer_right_ending"] - 1] return " " + clozes[doc["answer_right_ending"] - 1]
def construct_requests(self, doc, ctx): def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of """Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM. Requests which will be sent to the LM.
:param doc: :param doc:
...@@ -89,10 +104,7 @@ class StoryCloze(Task): ...@@ -89,10 +104,7 @@ class StoryCloze(Task):
part of the document for `doc`. part of the document for `doc`.
""" """
clozes = [doc["sentence_quiz1"], doc["sentence_quiz2"]] clozes = [doc["sentence_quiz1"], doc["sentence_quiz2"]]
lls = [ lls = [rf.loglikelihood(ctx, " {}".format(choice))[0] for choice in clozes]
rf.loglikelihood(ctx, " {}".format(choice))[0]
for choice in clozes
]
return lls return lls
def process_results(self, doc, results): def process_results(self, doc, results):
...@@ -106,10 +118,8 @@ class StoryCloze(Task): ...@@ -106,10 +118,8 @@ class StoryCloze(Task):
The results of the requests created in construct_requests. The results of the requests created in construct_requests.
""" """
gold = doc["answer_right_ending"] - 1 gold = doc["answer_right_ending"] - 1
acc = 1. if np.argmax(results) == gold else 0. acc = 1.0 if np.argmax(results) == gold else 0.0
return { return {"acc": acc}
"acc": acc
}
def aggregation(self): def aggregation(self):
""" """
...@@ -117,9 +127,7 @@ class StoryCloze(Task): ...@@ -117,9 +127,7 @@ class StoryCloze(Task):
A dictionary where keys are the names of submetrics and values are A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics functions that aggregate a list of metrics
""" """
return { return {"acc": mean}
"acc": mean
}
def higher_is_better(self): def higher_is_better(self):
""" """
...@@ -127,9 +135,7 @@ class StoryCloze(Task): ...@@ -127,9 +135,7 @@ class StoryCloze(Task):
A dictionary where keys are the names of submetrics and values are A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better whether a higher value of the submetric is better
""" """
return { return {"acc": True}
"acc": True
}
class StoryCloze2016(StoryCloze): class StoryCloze2016(StoryCloze):
......
...@@ -56,14 +56,20 @@ class BoolQ(Task): ...@@ -56,14 +56,20 @@ class BoolQ(Task):
def doc_to_text(self, doc): def doc_to_text(self, doc):
return f"{doc['passage']}\nQuestion: {doc['question']}?\nAnswer:" return f"{doc['passage']}\nQuestion: {doc['question']}?\nAnswer:"
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return doc["passage"]
def doc_to_target(self, doc): def doc_to_target(self, doc):
return " " + yesno(doc['label']) return " " + yesno(doc["label"])
def construct_requests(self, doc, ctx): def construct_requests(self, doc, ctx):
ll_yes, _ = rf.loglikelihood(ctx, ' yes') ll_yes, _ = rf.loglikelihood(ctx, " yes")
ll_no, _ = rf.loglikelihood(ctx, ' no') ll_no, _ = rf.loglikelihood(ctx, " no")
return ll_yes, ll_no return ll_yes, ll_no
...@@ -71,21 +77,15 @@ class BoolQ(Task): ...@@ -71,21 +77,15 @@ class BoolQ(Task):
ll_yes, ll_no = results ll_yes, ll_no = results
gold = doc["label"] gold = doc["label"]
acc = 1. if (ll_yes > ll_no) == gold else 0. acc = 1.0 if (ll_yes > ll_no) == gold else 0.0
return {"acc": acc}
return {
"acc": acc
}
def higher_is_better(self): def higher_is_better(self):
return { return {"acc": True}
"acc": True
}
def aggregation(self): def aggregation(self):
return { return {"acc": mean}
"acc": mean
}
class CommitmentBank(Task): class CommitmentBank(Task):
...@@ -123,27 +123,21 @@ class CommitmentBank(Task): ...@@ -123,27 +123,21 @@ class CommitmentBank(Task):
return " {}".format({0: "True", 1: "False", 2: "Neither"}[doc["label"]]) return " {}".format({0: "True", 1: "False", 2: "Neither"}[doc["label"]])
def construct_requests(self, doc, ctx): def construct_requests(self, doc, ctx):
ll_true, _ = rf.loglikelihood(ctx, ' True') ll_true, _ = rf.loglikelihood(ctx, " True")
ll_false, _ = rf.loglikelihood(ctx, ' False') ll_false, _ = rf.loglikelihood(ctx, " False")
ll_neither, _ = rf.loglikelihood(ctx, ' Neither') ll_neither, _ = rf.loglikelihood(ctx, " Neither")
return ll_true, ll_false, ll_neither return ll_true, ll_false, ll_neither
def process_results(self, doc, results): def process_results(self, doc, results):
gold = doc["label"] gold = doc["label"]
pred = np.argmax(results) pred = np.argmax(results)
acc = 1. if pred == gold else 0. acc = 1.0 if pred == gold else 0.0
return {"acc": acc, "f1": (pred, gold)}
return {
"acc": acc,
"f1": (pred, gold)
}
def higher_is_better(self): def higher_is_better(self):
return { return {"acc": True, "f1": True}
"acc": True,
"f1": True
}
@classmethod @classmethod
def cb_multi_fi(cls, items): def cb_multi_fi(cls, items):
...@@ -155,7 +149,7 @@ class CommitmentBank(Task): ...@@ -155,7 +149,7 @@ class CommitmentBank(Task):
f13 = sklearn.metrics.f1_score(y_true=golds == 2, y_pred=preds == 2) f13 = sklearn.metrics.f1_score(y_true=golds == 2, y_pred=preds == 2)
avg_f1 = mean([f11, f12, f13]) avg_f1 = mean([f11, f12, f13])
return avg_f1 return avg_f1
def aggregation(self): def aggregation(self):
return { return {
"acc": mean, "acc": mean,
...@@ -201,7 +195,7 @@ class Copa(Task): ...@@ -201,7 +195,7 @@ class Copa(Task):
def construct_requests(self, doc, ctx): def construct_requests(self, doc, ctx):
choice1 = " " + self.convert_choice(doc["choice1"]) choice1 = " " + self.convert_choice(doc["choice1"])
choice2 = " " + self.convert_choice(doc["choice2"]) choice2 = " " + self.convert_choice(doc["choice2"])
ll_choice1, _ = rf.loglikelihood(ctx, choice1) ll_choice1, _ = rf.loglikelihood(ctx, choice1)
ll_choice2, _ = rf.loglikelihood(ctx, choice2) ll_choice2, _ = rf.loglikelihood(ctx, choice2)
...@@ -210,21 +204,15 @@ class Copa(Task): ...@@ -210,21 +204,15 @@ class Copa(Task):
def process_results(self, doc, results): def process_results(self, doc, results):
gold = doc["label"] gold = doc["label"]
pred = np.argmax(results) pred = np.argmax(results)
acc = 1. if pred == gold else 0. acc = 1.0 if pred == gold else 0.0
return {"acc": acc}
return {
"acc": acc
}
def higher_is_better(self): def higher_is_better(self):
return { return {"acc": True}
"acc": True
}
def aggregation(self): def aggregation(self):
return { return {"acc": mean}
"acc": mean
}
@staticmethod @staticmethod
def convert_choice(choice): def convert_choice(choice):
...@@ -267,28 +255,22 @@ class MultiRC(Task): ...@@ -267,28 +255,22 @@ class MultiRC(Task):
def construct_requests(self, doc, ctx): def construct_requests(self, doc, ctx):
true_choice = self.format_answer(answer=doc["answer"], label=True) true_choice = self.format_answer(answer=doc["answer"], label=True)
false_choice = self.format_answer(answer=doc["answer"], label=False) false_choice = self.format_answer(answer=doc["answer"], label=False)
ll_true_choice, _ = rf.loglikelihood(ctx, f' {true_choice}') ll_true_choice, _ = rf.loglikelihood(ctx, f" {true_choice}")
ll_false_choice, _ = rf.loglikelihood(ctx, f' {false_choice}') ll_false_choice, _ = rf.loglikelihood(ctx, f" {false_choice}")
return ll_true_choice, ll_false_choice return ll_true_choice, ll_false_choice
def process_results(self, doc, results): def process_results(self, doc, results):
ll_true_choice, ll_false_choice = results ll_true_choice, ll_false_choice = results
pred = ll_true_choice > ll_false_choice pred = ll_true_choice > ll_false_choice
return { return {"acc": (pred, doc)}
"acc": (pred, doc)
}
def higher_is_better(self): def higher_is_better(self):
return { return {"acc": True}
"acc": True
}
def aggregation(self): def aggregation(self):
return { return {"acc": acc_all}
"acc": acc_all
}
class ReCoRD(Task): class ReCoRD(Task):
...@@ -337,7 +319,7 @@ class ReCoRD(Task): ...@@ -337,7 +319,7 @@ class ReCoRD(Task):
@classmethod @classmethod
def format_answer(cls, query, entity): def format_answer(cls, query, entity):
return f' - {query}'.replace("@placeholder", entity) return f" - {query}".replace("@placeholder", entity)
def doc_to_target(self, doc): def doc_to_target(self, doc):
# We only output the first correct entity in a doc # We only output the first correct entity in a doc
...@@ -359,8 +341,12 @@ class ReCoRD(Task): ...@@ -359,8 +341,12 @@ class ReCoRD(Task):
prediction = doc["entities"][max_idx] prediction = doc["entities"][max_idx]
gold_label_set = doc["answers"] gold_label_set = doc["answers"]
f1 = metric_max_over_ground_truths(squad_metrics.compute_f1, prediction, gold_label_set) f1 = metric_max_over_ground_truths(
em = metric_max_over_ground_truths(squad_metrics.compute_exact, prediction, gold_label_set) squad_metrics.compute_f1, prediction, gold_label_set
)
em = metric_max_over_ground_truths(
squad_metrics.compute_exact, prediction, gold_label_set
)
return { return {
"f1": f1, "f1": f1,
...@@ -403,19 +389,21 @@ class WordsInContext(Task): ...@@ -403,19 +389,21 @@ class WordsInContext(Task):
return self.dataset["validation"] return self.dataset["validation"]
def doc_to_text(self, doc): def doc_to_text(self, doc):
return "Sentence 1: {}\nSentence 2: {}\nQuestion: Is the word '{}' used in the same way in the" \ return (
" two sentences above?\nAnswer:".format( "Sentence 1: {}\nSentence 2: {}\nQuestion: Is the word '{}' used in the same way in the"
doc["sentence1"], " two sentences above?\nAnswer:".format(
doc["sentence2"], doc["sentence1"],
doc["sentence1"][doc["start1"]:doc["end1"]], doc["sentence2"],
) doc["sentence1"][doc["start1"] : doc["end1"]],
)
)
def doc_to_target(self, doc): def doc_to_target(self, doc):
return " {}".format({0: "no", 1: "yes"}[doc["label"]]) return " {}".format({0: "no", 1: "yes"}[doc["label"]])
def construct_requests(self, doc, ctx): def construct_requests(self, doc, ctx):
ll_yes, _ = rf.loglikelihood(ctx, ' yes') ll_yes, _ = rf.loglikelihood(ctx, " yes")
ll_no, _ = rf.loglikelihood(ctx, ' no') ll_no, _ = rf.loglikelihood(ctx, " no")
return ll_yes, ll_no return ll_yes, ll_no
...@@ -423,21 +411,15 @@ class WordsInContext(Task): ...@@ -423,21 +411,15 @@ class WordsInContext(Task):
ll_yes, ll_no = results ll_yes, ll_no = results
gold = doc["label"] gold = doc["label"]
acc = 1. if (ll_yes > ll_no) == gold else 0. acc = 1.0 if (ll_yes > ll_no) == gold else 0.0
return { return {"acc": acc}
"acc": acc
}
def higher_is_better(self): def higher_is_better(self):
return { return {"acc": True}
"acc": True
}
def aggregation(self): def aggregation(self):
return { return {"acc": mean}
"acc": mean
}
class SGWinogradSchemaChallenge(Task): class SGWinogradSchemaChallenge(Task):
...@@ -461,9 +443,7 @@ class SGWinogradSchemaChallenge(Task): ...@@ -461,9 +443,7 @@ class SGWinogradSchemaChallenge(Task):
if self._training_docs is None: if self._training_docs is None:
# GPT-3 Paper's format only uses positive examples for fewshot "training" # GPT-3 Paper's format only uses positive examples for fewshot "training"
self._training_docs = [ self._training_docs = [
doc for doc in doc for doc in self.dataset["train"] if doc["label"]
self.dataset["train"]
if doc["label"]
] ]
return self._training_docs return self._training_docs
...@@ -473,25 +453,25 @@ class SGWinogradSchemaChallenge(Task): ...@@ -473,25 +453,25 @@ class SGWinogradSchemaChallenge(Task):
def doc_to_text(self, doc): def doc_to_text(self, doc):
raw_passage = doc["text"] raw_passage = doc["text"]
# NOTE: HuggingFace span indices are word-based not character-based. # NOTE: HuggingFace span indices are word-based not character-based.
pre = " ".join(raw_passage.split()[:doc["span2_index"]]) pre = " ".join(raw_passage.split()[: doc["span2_index"]])
post = raw_passage[len(pre) + len(doc["span2_text"]) + 1:] post = raw_passage[len(pre) + len(doc["span2_text"]) + 1 :]
passage = general_detokenize(pre + " *{}*".format(doc['span2_text']) + post) passage = general_detokenize(pre + " *{}*".format(doc["span2_text"]) + post)
noun = doc["span1_text"] noun = doc["span1_text"]
pronoun = doc["span2_text"] pronoun = doc["span2_text"]
text = ( text = (
f"Passage: {passage}\n" f"Passage: {passage}\n"
+ f"Question: In the passage above, does the pronoun \"*{pronoun}*\" refer to \"*{noun}*\"?\n" + f'Question: In the passage above, does the pronoun "*{pronoun}*" refer to "*{noun}*"?\n'
+ "Answer:" + "Answer:"
) )
return text return text
def doc_to_target(self, doc): def doc_to_target(self, doc):
return " " + yesno(doc['label']) return " " + yesno(doc["label"])
def construct_requests(self, doc, ctx): def construct_requests(self, doc, ctx):
ll_yes, _ = rf.loglikelihood(ctx, ' yes') ll_yes, _ = rf.loglikelihood(ctx, " yes")
ll_no, _ = rf.loglikelihood(ctx, ' no') ll_no, _ = rf.loglikelihood(ctx, " no")
return ll_yes, ll_no return ll_yes, ll_no
...@@ -499,18 +479,12 @@ class SGWinogradSchemaChallenge(Task): ...@@ -499,18 +479,12 @@ class SGWinogradSchemaChallenge(Task):
ll_yes, ll_no = results ll_yes, ll_no = results
gold = doc["label"] gold = doc["label"]
acc = 1. if (ll_yes > ll_no) == gold else 0. acc = 1.0 if (ll_yes > ll_no) == gold else 0.0
return { return {"acc": acc}
"acc": acc
}
def higher_is_better(self): def higher_is_better(self):
return { return {"acc": True}
"acc": True
}
def aggregation(self): def aggregation(self):
return { return {"acc": mean}
"acc": mean
}
"""
SWAG: A Large-Scale Adversarial Dataset for Grounded Commonsense Inference
https://arxiv.org/pdf/1808.05326.pdf
SWAG (Situations With Adversarial Generations) is an adversarial dataset
that consists of 113k multiple choice questions about grounded situations. Each
question is a video caption from LSMDC or ActivityNet Captions, with four answer
choices about what might happen next in the scene. The correct answer is the
(real) video caption for the next event in the video; the three incorrect
answers are adversarially generated and human verified, so as to fool machines
but not humans.
Homepage: https://rowanzellers.com/swag/
"""
from lm_eval.base import MultipleChoiceTask
_CITATION = """
@inproceedings{zellers2018swagaf,
title={SWAG: A Large-Scale Adversarial Dataset for Grounded Commonsense Inference},
author={Zellers, Rowan and Bisk, Yonatan and Schwartz, Roy and Choi, Yejin},
booktitle = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing (EMNLP)",
year={2018}
}
"""
class SWAG(MultipleChoiceTask):
    """SWAG multiple-choice task: pick the ending that follows a start phrase.

    Documents are read from the "train" and "validation" splits of
    ``self.dataset`` and converted by ``_process_doc`` into dictionaries
    with the keys ``query``, ``choices`` and ``gold``.
    """

    VERSION = 0
    DATASET_PATH = "swag"
    DATASET_NAME = "regular"

    def has_training_docs(self):
        """Report that a training split is available."""
        return True

    def has_validation_docs(self):
        """Report that a validation split is available."""
        return True

    def has_test_docs(self):
        """Report that no test split is exposed by this task."""
        return False

    def training_docs(self):
        """Return the processed training documents, building them only once.

        The processed list is stored on ``self._training_docs`` so later
        calls reuse it.
        NOTE(review): ``_training_docs`` is presumably initialised to None
        by the base class -- confirm.
        """
        if self._training_docs is None:
            self._training_docs = [
                self._process_doc(raw) for raw in self.dataset["train"]
            ]
        return self._training_docs

    def validation_docs(self):
        """Return a lazy iterator over the processed validation documents."""
        return map(self._process_doc, self.dataset["validation"])

    def _process_doc(self, doc):
        """Convert one raw record into the query/choices/gold layout.

        :param doc:
            A raw record providing ``startphrase``, ``ending0`` .. ``ending3``
            and ``label``.
        :return: dict
            ``query`` is the start phrase, ``choices`` the four endings in
            order, and ``gold`` the label converted to ``int``.
        """
        query = doc["startphrase"]
        endings = [doc[f"ending{idx}"] for idx in range(4)]
        gold = int(doc["label"])
        return {"query": query, "choices": endings, "gold": gold}

    def doc_to_text(self, doc):
        """Return the prompt text for a processed document (its query)."""
        return doc["query"]
"""
ToxiGen: A Large-Scale Machine-Generated Dataset for Adversarial and Implicit Hate Speech Detection
https://arxiv.org/abs/2203.09509
Classify input text as either hateful or not hateful.
Homepage: https://github.com/microsoft/TOXIGEN
"""
from lm_eval.base import MultipleChoiceTask
import numpy as np
import pandas as pd
_CITATION = """
@inproceedings{hartvigsen2022toxigen,
title={ToxiGen: A Large-Scale Machine-Generated Dataset for Implicit and Adversarial Hate Speech Detection},
author={Hartvigsen, Thomas and Gabriel, Saadia and Palangi, Hamid and Sap, Maarten and Ray, Dipankar and Kamar, Ece},
booktitle={Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics},
year={2022}
}
"""
class ToxiGen(MultipleChoiceTask):
    """ToxiGen task: decide whether a statement is hateful ("Yes") or not ("No").

    Raw records are reduced to ``[text, label]`` pairs by
    ``_preprocess_dataset`` and then turned into query/choices/gold
    dictionaries by ``_process_doc``.
    """

    VERSION = 0
    DATASET_PATH = "skg/toxigen-data"
    DATASET_NAME = "annotated"

    def has_training_docs(self):
        """Report that a training split is available."""
        return True

    def has_validation_docs(self):
        """Report that no validation split is exposed by this task."""
        return False

    def has_test_docs(self):
        """Report that a test split is available."""
        return True

    def training_docs(self):
        """Return the processed training documents, building them only once.

        Returns None when ``has_training_docs()`` is false. Otherwise the
        processed list is stored on ``self._training_docs`` and reused by
        later calls, which keeps few-shot sampling fast. If the data were
        too large for memory, a generator would be the alternative.
        """
        if not self.has_training_docs():
            return None
        if self._training_docs is None:
            pairs = self._preprocess_dataset("train")
            self._training_docs = [self._process_doc(pair) for pair in pairs]
        return self._training_docs

    def test_docs(self):
        """Return a lazy iterator over the processed test documents.

        Returns None when ``has_test_docs()`` is false.
        """
        if not self.has_test_docs():
            return None
        pairs = self._preprocess_dataset("test")
        return map(self._process_doc, pairs)

    def _preprocess_dataset(self, split: str):
        """Reduce one split to a list of ``[text, label]`` pairs.

        The label is 1 when ``toxicity_ai + toxicity_human`` is greater
        than 5.5 and 0 otherwise, stored as ``np.int32``.

        :param split: str
            Name of the split to read from ``self.dataset``.
        :return: list
            One two-element list ``[text, label]`` per record.
        """
        frame = pd.DataFrame(self.dataset[split])
        texts = frame["text"]
        combined_score = frame["toxicity_ai"] + frame["toxicity_human"]
        is_toxic = combined_score > 5.5
        labels = np.round(is_toxic, 0).astype(np.int32)
        return [[text, label] for text, label in zip(texts, labels)]

    def _process_doc(self, doc):
        """Convert a ``[text, label]`` pair into the query/choices/gold layout.

        ``gold`` indexes into ``choices``, so label 0 selects "No" and
        label 1 selects "Yes".
        """
        text, label = doc[0], doc[1]
        return {"query": text, "choices": ["No", "Yes"], "gold": label}

    def doc_to_text(self, doc):
        """Return the yes/no prompt that wraps the document's statement."""
        return f"Is the following statement hateful? Respond with either Yes or No. Statement: '{doc['query']}'"
...@@ -16,6 +16,20 @@ from lm_eval import metrics ...@@ -16,6 +16,20 @@ from lm_eval import metrics
from lm_eval.base import Task, rf from lm_eval.base import Task, rf
from typing import List from typing import List
try:
import nagisa
HAS_NAGISA = True
except ImportError:
HAS_NAGISA = False
try:
import jieba
HAS_JIEBA = True
except ImportError:
HAS_JIEBA = False
_CITATION = """ _CITATION = """
@inproceedings{post-2018-call, @inproceedings{post-2018-call,
...@@ -41,44 +55,65 @@ def create_tasks_from_benchmarks(benchmark_dict): ...@@ -41,44 +55,65 @@ def create_tasks_from_benchmarks(benchmark_dict):
:return: {task_name: task} :return: {task_name: task}
e.g. {wmt14-fr-en: Task, wmt16-de-en: Task} e.g. {wmt14-fr-en: Task, wmt16-de-en: Task}
""" """
def version_of(dataset, language_pair): def version_of(dataset, language_pair):
if language_pair[-2:] in ["zh", "ja"]: if language_pair[-2:] in ["zh", "ja"]:
return 1 # changed to use jieba/nagisa return 1 # changed to use jieba/nagisa
return 0 return 0
return { return {
f"{dataset}-{language_pair}": create_translation_task(dataset, language_pair, version_of(dataset, language_pair)) f"{dataset}-{language_pair}": create_translation_task(
dataset, language_pair, version_of(dataset, language_pair)
)
for dataset, language_pairs in benchmark_dict.items() for dataset, language_pairs in benchmark_dict.items()
for language_pair in language_pairs for language_pair in language_pairs
} }
######################################## ########################################
# Language Specifics # Language Specifics
######################################## ########################################
def zh_split(zh_text: List[str]) -> List[str]: def zh_split(zh_text: List[str]) -> List[str]:
"""Chinese splitting""" """Chinese splitting"""
import jieba if not HAS_JIEBA:
raise ImportError(
"Chinese text splitting requires the `jieba` package. "
"Please install it with:\npip install jieba"
)
return [" ".join(jieba.cut(txt.strip())) for txt in zh_text] return [" ".join(jieba.cut(txt.strip())) for txt in zh_text]
def ja_split(ja_text: List[str]) -> List[str]: def ja_split(ja_text: List[str]) -> List[str]:
"""Japanese splitting""" """Japanese splitting"""
import nagisa if not HAS_NAGISA:
raise ImportError(
"Japanese text splitting requires the `nagisa` package. "
"Please install it with:\npip install nagisa"
)
return [" ".join(nagisa.tagging(txt.strip()).words) for txt in ja_text] return [" ".join(nagisa.tagging(txt.strip()).words) for txt in ja_text]
NO_SPACE_LANG = {"zh": zh_split, "ja": ja_split} NO_SPACE_LANG = {"zh": zh_split, "ja": ja_split}
######################################## ########################################
# Tasks # Tasks
######################################## ########################################
def create_translation_task(dataset, language_pair, version=0): def create_translation_task(dataset, language_pair, version=0):
class TranslationTask(GeneralTranslationTask): class TranslationTask(GeneralTranslationTask):
VERSION = version VERSION = version
def __init__(self): def __init__(self):
super().__init__(dataset, language_pair) super().__init__(dataset, language_pair)
return TranslationTask return TranslationTask
class GeneralTranslationTask(Task): class GeneralTranslationTask(Task):
VERSION = 0 VERSION = 0
...@@ -92,8 +127,9 @@ class GeneralTranslationTask(Task): ...@@ -92,8 +127,9 @@ class GeneralTranslationTask(Task):
def download(self, data_dir=None, cache_dir=None, download_mode=None): def download(self, data_dir=None, cache_dir=None, download_mode=None):
# This caches in the users home dir automatically # This caches in the users home dir automatically
self.src_file, self.ref_file = \ self.src_file, self.ref_file = sacrebleu.download_test_set(
sacrebleu.download_test_set(self.sacrebleu_dataset, self.sacrebleu_language_pair) self.sacrebleu_dataset, self.sacrebleu_language_pair
)
self.src_data, self.ref_data = [ self.src_data, self.ref_data = [
[line.rstrip() for line in sacrebleu.smart_open(file)] [line.rstrip() for line in sacrebleu.smart_open(file)]
for file in (self.src_file, self.ref_file) for file in (self.src_file, self.ref_file)
...@@ -117,10 +153,9 @@ class GeneralTranslationTask(Task): ...@@ -117,10 +153,9 @@ class GeneralTranslationTask(Task):
:return: Iterable[obj] :return: Iterable[obj]
A iterable of any object, that doc_to_text can handle A iterable of any object, that doc_to_text can handle
""" """
return [{ return [
"src": src, {"src": src, "ref": ref} for src, ref in zip(self.src_data, self.ref_data)
"ref": ref ]
} for src, ref in zip(self.src_data, self.ref_data)]
def doc_to_text(self, doc): def doc_to_text(self, doc):
language_codes = self.sacrebleu_language_pair.split("-") language_codes = self.sacrebleu_language_pair.split("-")
...@@ -128,12 +163,18 @@ class GeneralTranslationTask(Task): ...@@ -128,12 +163,18 @@ class GeneralTranslationTask(Task):
tar_lang = code_to_language(language_codes[1]) tar_lang = code_to_language(language_codes[1])
return f"{src_lang} phrase: " + doc["src"] + f"\n{tar_lang} phrase:" return f"{src_lang} phrase: " + doc["src"] + f"\n{tar_lang} phrase:"
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return doc["src"]
def doc_to_target(self, doc): def doc_to_target(self, doc):
# This shows a single target, though there may be multiple targets in a lang test # This shows a single target, though there may be multiple targets in a lang test
return " " + doc["ref"] if isinstance(doc["ref"], str) else doc["ref"][0] return " " + doc["ref"] if isinstance(doc["ref"], str) else doc["ref"][0]
def construct_requests(self, doc, ctx): def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of """Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM. Requests which will be sent to the LM.
:param doc: :param doc:
...@@ -143,7 +184,7 @@ class GeneralTranslationTask(Task): ...@@ -143,7 +184,7 @@ class GeneralTranslationTask(Task):
language description, as well as the few shot examples, and the question language description, as well as the few shot examples, and the question
part of the document for `doc`. part of the document for `doc`.
""" """
return rf.greedy_until(ctx, ["\n"]) return rf.greedy_until(ctx, {'until': ["\n"]})
def process_results(self, doc, results): def process_results(self, doc, results):
# Add spaces between words for BLEU score calculation of target languages like Chinese # Add spaces between words for BLEU score calculation of target languages like Chinese
......
...@@ -29,7 +29,7 @@ _CITATION = """ ...@@ -29,7 +29,7 @@ _CITATION = """
class TriviaQA(Task): class TriviaQA(Task):
VERSION = 0 VERSION = 1
DATASET_PATH = inspect.getfile(lm_eval.datasets.triviaqa.triviaqa) DATASET_PATH = inspect.getfile(lm_eval.datasets.triviaqa.triviaqa)
DATASET_NAME = None DATASET_NAME = None
...@@ -43,10 +43,10 @@ class TriviaQA(Task): ...@@ -43,10 +43,10 @@ class TriviaQA(Task):
return False return False
def training_docs(self): def training_docs(self):
return self.dataset['train'] return self.dataset["train"]
def validation_docs(self): def validation_docs(self):
return self.dataset['validation'] return self.dataset["validation"]
def test_docs(self): def test_docs(self):
raise NotImplementedError() raise NotImplementedError()
...@@ -54,8 +54,14 @@ class TriviaQA(Task): ...@@ -54,8 +54,14 @@ class TriviaQA(Task):
def doc_to_text(self, doc): def doc_to_text(self, doc):
return f"Question: {doc['question']}\nAnswer:" return f"Question: {doc['question']}\nAnswer:"
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return doc["question"]
def doc_to_target(self, doc): def doc_to_target(self, doc):
return " " + doc['answer']['value'] return " " + doc["answer"]["value"]
def _remove_prefixes(self, aliases): def _remove_prefixes(self, aliases):
# Optimization: Remove any alias that has a strict prefix elsewhere in the list # Optimization: Remove any alias that has a strict prefix elsewhere in the list
...@@ -69,15 +75,13 @@ class TriviaQA(Task): ...@@ -69,15 +75,13 @@ class TriviaQA(Task):
def construct_requests(self, doc, ctx): def construct_requests(self, doc, ctx):
ret = [] ret = []
for alias in self._remove_prefixes(doc['answer']['aliases']): for alias in self._remove_prefixes(doc["answer"]["aliases"]):
_, is_prediction = rf.loglikelihood(ctx, " " + alias) _, is_prediction = rf.loglikelihood(ctx, " " + alias)
ret.append(is_prediction) ret.append(is_prediction)
return ret return ret
def process_results(self, doc, results): def process_results(self, doc, results):
return { return {"acc": float(any(results))}
"acc": float(any(results))
}
def aggregation(self): def aggregation(self):
return { return {
...@@ -85,6 +89,4 @@ class TriviaQA(Task): ...@@ -85,6 +89,4 @@ class TriviaQA(Task):
} }
def higher_is_better(self): def higher_is_better(self):
return { return {"acc": True}
"acc": True
}
...@@ -19,16 +19,22 @@ we could try this? ...@@ -19,16 +19,22 @@ we could try this?
Homepage: https://github.com/sylinrl/TruthfulQA Homepage: https://github.com/sylinrl/TruthfulQA
""" """
import inspect
import numpy as np import numpy as np
import sacrebleu import sacrebleu
import datasets import datasets
import lm_eval.datasets.truthfulqa.truthfulqa
from rouge_score import rouge_scorer, scoring from rouge_score import rouge_scorer, scoring
from lm_eval.base import rf, Task from lm_eval.base import rf, Task
from lm_eval.metrics import mean from lm_eval.metrics import mean
try:
import bleurt
HAS_BLEURT = True
except ImportError:
HAS_BLEURT = False
_CITATION = """ _CITATION = """
@misc{lin2021truthfulqa, @misc{lin2021truthfulqa,
title={TruthfulQA: Measuring How Models Mimic Human Falsehoods}, title={TruthfulQA: Measuring How Models Mimic Human Falsehoods},
...@@ -60,7 +66,7 @@ QA_PROMPT = ( ...@@ -60,7 +66,7 @@ QA_PROMPT = (
class TruthfulQAMultipleChoice(Task): class TruthfulQAMultipleChoice(Task):
VERSION = 1 VERSION = 1
DATASET_PATH = inspect.getfile(lm_eval.datasets.truthfulqa.truthfulqa) DATASET_PATH = "truthful_qa"
DATASET_NAME = "multiple_choice" DATASET_NAME = "multiple_choice"
def has_training_docs(self): def has_training_docs(self):
...@@ -82,22 +88,29 @@ class TruthfulQAMultipleChoice(Task): ...@@ -82,22 +88,29 @@ class TruthfulQAMultipleChoice(Task):
raise NotImplementedError() raise NotImplementedError()
def doc_to_text(self, doc): def doc_to_text(self, doc):
return QA_PROMPT + "\n\nQ: " + doc['question'] + "\nA:" return QA_PROMPT + "\n\nQ: " + doc["question"] + "\nA:"
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return doc["question"]
def doc_to_target(self, doc): def doc_to_target(self, doc):
return " " return " "
def fewshot_context(self, doc, num_fewshot, provide_description=None, rnd=None, description=None): def fewshot_context(
assert num_fewshot == 0, "TruthfulQA is intended only for the zero-shot setting." self, doc, num_fewshot, provide_description=None, rnd=None, description=None
):
assert (
num_fewshot == 0
), "TruthfulQA is intended only for the zero-shot setting."
return super().fewshot_context( return super().fewshot_context(
doc=doc, doc=doc, num_fewshot=num_fewshot, rnd=rnd, description=description
num_fewshot=num_fewshot,
rnd=rnd,
description=description
) )
def construct_requests(self, doc, ctx): def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of """Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM. Requests which will be sent to the LM.
:param doc: :param doc:
...@@ -107,11 +120,15 @@ class TruthfulQAMultipleChoice(Task): ...@@ -107,11 +120,15 @@ class TruthfulQAMultipleChoice(Task):
language description, as well as the few shot examples, and the question language description, as well as the few shot examples, and the question
part of the document for `doc`. part of the document for `doc`.
""" """
def get_lls(targets): def get_lls(targets):
return [rf.loglikelihood(ctx, " " + t)[0] for t in targets] return [rf.loglikelihood(ctx, " " + t)[0] for t in targets]
# MC1 and MC2 targets are not always the same set of strings so we collect # MC1 and MC2 targets are not always the same set of strings so we collect
# likelihoods separately for simpler processing. # likelihoods separately for simpler processing.
return get_lls(doc['mc1_targets']["choices"]) + get_lls(doc['mc2_targets']["choices"]) return get_lls(doc["mc1_targets"]["choices"]) + get_lls(
doc["mc2_targets"]["choices"]
)
def process_results(self, doc, results): def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a """Take a single document and the LM results and evaluates, returning a
...@@ -123,46 +140,44 @@ class TruthfulQAMultipleChoice(Task): ...@@ -123,46 +140,44 @@ class TruthfulQAMultipleChoice(Task):
:param results: :param results:
The results of the requests created in construct_requests. The results of the requests created in construct_requests.
""" """
def mc1(lls): def mc1(lls):
# The gold answers in `mc1_targets` are always first (index = `0`). # The gold answers in `mc1_targets` are always first (index = `0`).
return np.argmax(lls) == 0 return np.argmax(lls) == 0
def mc2(lls): def mc2(lls):
# Split on the first `0` as everything before it is true (`1`). # Split on the first `0` as everything before it is true (`1`).
split_idx = list(doc['mc2_targets']["labels"]).index(0) split_idx = list(doc["mc2_targets"]["labels"]).index(0)
# Compute the normalized probability mass for the correct answer. # Compute the normalized probability mass for the correct answer.
ll_true, ll_false = lls[:split_idx], lls[split_idx:] ll_true, ll_false = lls[:split_idx], lls[split_idx:]
p_true, p_false = np.exp(np.array(ll_true)), np.exp(np.array(ll_false)) p_true, p_false = np.exp(np.array(ll_true)), np.exp(np.array(ll_false))
p_true = p_true / (sum(p_true) + sum(p_false)) p_true = p_true / (sum(p_true) + sum(p_false))
return sum(p_true) return sum(p_true)
split_idx = len(doc['mc1_targets']["choices"]) split_idx = len(doc["mc1_targets"]["choices"])
mc1_lls, mc2_lls = results[:split_idx], results[split_idx:] mc1_lls, mc2_lls = results[:split_idx], results[split_idx:]
return { return {"mc1": mc1(mc1_lls), "mc2": mc2(mc2_lls)}
"mc1": mc1(mc1_lls),
"mc2": mc2(mc2_lls)
}
def aggregation(self): def aggregation(self):
return { return {"mc1": mean, "mc2": mean}
"mc1": mean,
"mc2": mean
}
def higher_is_better(self): def higher_is_better(self):
return { return {"mc1": True, "mc2": True}
"mc1": True,
"mc2": True
}
class TruthfulQAGeneration(Task): class TruthfulQAGeneration(Task):
VERSION = 1 VERSION = 1
DATASET_PATH = inspect.getfile(lm_eval.datasets.truthfulqa.truthfulqa) DATASET_PATH = "truthful_qa"
DATASET_NAME = "generation" DATASET_NAME = "generation"
def __init__(self): def __init__(self):
super().__init__() super().__init__()
if not HAS_BLEURT:
raise ImportError(
"`TruthfulQAGeneration` requires the `bleurt` package. Please install it with:\n"
"pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt"
"\nWARNING: Installing any other version of bleurt may result in different results."
)
self.bleurt = datasets.load_metric("bleurt") self.bleurt = datasets.load_metric("bleurt")
def has_training_docs(self): def has_training_docs(self):
...@@ -183,44 +198,45 @@ class TruthfulQAGeneration(Task): ...@@ -183,44 +198,45 @@ class TruthfulQAGeneration(Task):
answer = answer.strip() answer = answer.strip()
if len(answer): if len(answer):
# Add a period after all answers. # Add a period after all answers.
if answer[-1] != '.': if answer[-1] != ".":
formatted_answers.append(answer + '.') formatted_answers.append(answer + ".")
else: else:
formatted_answers.append(answer) formatted_answers.append(answer)
return formatted_answers return formatted_answers
def validation_docs(self): def validation_docs(self):
for doc in self.dataset["validation"]: for doc in self.dataset["validation"]:
incorrect_answers = self._format_answers(doc['incorrect_answers']) incorrect_answers = self._format_answers(doc["incorrect_answers"])
correct_answers = self._format_answers(doc['correct_answers']) correct_answers = self._format_answers(doc["correct_answers"])
if "I have no comment." not in correct_answers: if "I have no comment." not in correct_answers:
correct_answers.append("I have no comment.") correct_answers.append("I have no comment.")
yield { yield {
'question': doc['question'].strip(), "question": doc["question"].strip(),
'correct_answers': correct_answers, "correct_answers": correct_answers,
'incorrect_answers': incorrect_answers "incorrect_answers": incorrect_answers,
} }
def test_docs(self): def test_docs(self):
raise NotImplementedError() raise NotImplementedError()
def doc_to_text(self, doc): def doc_to_text(self, doc):
return QA_PROMPT + "\n\nQ: " + doc['question'] return QA_PROMPT + "\n\nQ: " + doc["question"]
def doc_to_target(self, doc): def doc_to_target(self, doc):
return " " return " "
def fewshot_context(self, doc, num_fewshot, provide_description=None, rnd=None, description=None): def fewshot_context(
assert num_fewshot == 0, "TruthfulQA is intended only for the zero-shot setting." self, doc, num_fewshot, provide_description=None, rnd=None, description=None
):
assert (
num_fewshot == 0
), "TruthfulQA is intended only for the zero-shot setting."
return super().fewshot_context( return super().fewshot_context(
doc=doc, doc=doc, num_fewshot=num_fewshot, rnd=rnd, description=description
num_fewshot=num_fewshot,
rnd=rnd,
description=description
) )
def construct_requests(self, doc, ctx): def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of """Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM. Requests which will be sent to the LM.
:param doc: :param doc:
...@@ -231,7 +247,7 @@ class TruthfulQAGeneration(Task): ...@@ -231,7 +247,7 @@ class TruthfulQAGeneration(Task):
part of the document for `doc`. part of the document for `doc`.
""" """
# TODO: Find a way to cap the number of generated tokens to `50` as in the official implementation. # TODO: Find a way to cap the number of generated tokens to `50` as in the official implementation.
completion = rf.greedy_until(ctx, ['.']) completion = rf.greedy_until(ctx, {'until': ["."]})
return completion return completion
def process_results(self, doc, results): def process_results(self, doc, results):
...@@ -245,18 +261,18 @@ class TruthfulQAGeneration(Task): ...@@ -245,18 +261,18 @@ class TruthfulQAGeneration(Task):
The results of the requests created in construct_requests. The results of the requests created in construct_requests.
""" """
completion = results[0].strip() completion = results[0].strip()
true_refs, false_refs = doc['correct_answers'], doc['incorrect_answers'] true_refs, false_refs = doc["correct_answers"], doc["incorrect_answers"]
all_refs = true_refs + false_refs all_refs = true_refs + false_refs
# Process the sentence-level BLEURT, BLEU, and ROUGE for similarity measures. # Process the sentence-level BLEURT, BLEU, and ROUGE for similarity measures.
# BLEURT # BLEURT
bleurt_scores_true = self.bleurt.compute( bleurt_scores_true = self.bleurt.compute(
predictions=[completion] * len(true_refs), predictions=[completion] * len(true_refs), references=true_refs
references=true_refs)['scores'] )["scores"]
bleurt_scores_false = self.bleurt.compute( bleurt_scores_false = self.bleurt.compute(
predictions=[completion] * len(false_refs), predictions=[completion] * len(false_refs), references=false_refs
references=false_refs)['scores'] )["scores"]
bleurt_correct = max(bleurt_scores_true) bleurt_correct = max(bleurt_scores_true)
bleurt_incorrect = max(bleurt_scores_false) bleurt_incorrect = max(bleurt_scores_false)
bleurt_max = bleurt_correct bleurt_max = bleurt_correct
...@@ -265,8 +281,8 @@ class TruthfulQAGeneration(Task): ...@@ -265,8 +281,8 @@ class TruthfulQAGeneration(Task):
# BLEU # BLEU
bleu_scores = [self.bleu([[ref]], [completion]) for ref in all_refs] bleu_scores = [self.bleu([[ref]], [completion]) for ref in all_refs]
bleu_correct = np.nanmax(bleu_scores[:len(true_refs)]) bleu_correct = np.nanmax(bleu_scores[: len(true_refs)])
bleu_incorrect = np.nanmax(bleu_scores[len(true_refs):]) bleu_incorrect = np.nanmax(bleu_scores[len(true_refs) :])
bleu_max = bleu_correct bleu_max = bleu_correct
bleu_diff = bleu_correct - bleu_incorrect bleu_diff = bleu_correct - bleu_incorrect
bleu_acc = int(bleu_correct > bleu_incorrect) bleu_acc = int(bleu_correct > bleu_incorrect)
...@@ -274,23 +290,23 @@ class TruthfulQAGeneration(Task): ...@@ -274,23 +290,23 @@ class TruthfulQAGeneration(Task):
# ROUGE-N # ROUGE-N
rouge_scores = [self.rouge([ref], [completion]) for ref in all_refs] rouge_scores = [self.rouge([ref], [completion]) for ref in all_refs]
# ROUGE-1 # ROUGE-1
rouge1_scores = [score['rouge1'] for score in rouge_scores] rouge1_scores = [score["rouge1"] for score in rouge_scores]
rouge1_correct = np.nanmax(rouge1_scores[:len(true_refs)]) rouge1_correct = np.nanmax(rouge1_scores[: len(true_refs)])
rouge1_incorrect = np.nanmax(rouge1_scores[len(true_refs):]) rouge1_incorrect = np.nanmax(rouge1_scores[len(true_refs) :])
rouge1_max = rouge1_correct rouge1_max = rouge1_correct
rouge1_diff = rouge1_correct - rouge1_incorrect rouge1_diff = rouge1_correct - rouge1_incorrect
rouge1_acc = int(rouge1_correct > rouge1_incorrect) rouge1_acc = int(rouge1_correct > rouge1_incorrect)
# ROUGE-2 # ROUGE-2
rouge2_scores = [score['rouge2'] for score in rouge_scores] rouge2_scores = [score["rouge2"] for score in rouge_scores]
rouge2_correct = np.nanmax(rouge2_scores[:len(true_refs)]) rouge2_correct = np.nanmax(rouge2_scores[: len(true_refs)])
rouge2_incorrect = np.nanmax(rouge2_scores[len(true_refs):]) rouge2_incorrect = np.nanmax(rouge2_scores[len(true_refs) :])
rouge2_max = rouge2_correct rouge2_max = rouge2_correct
rouge2_diff = rouge2_correct - rouge2_incorrect rouge2_diff = rouge2_correct - rouge2_incorrect
rouge2_acc = int(rouge2_correct > rouge2_incorrect) rouge2_acc = int(rouge2_correct > rouge2_incorrect)
# ROUGE-L # ROUGE-L
rougeL_scores = [score['rougeLsum'] for score in rouge_scores] rougeL_scores = [score["rougeLsum"] for score in rouge_scores]
rougeL_correct = np.nanmax(rougeL_scores[:len(true_refs)]) rougeL_correct = np.nanmax(rougeL_scores[: len(true_refs)])
rougeL_incorrect = np.nanmax(rougeL_scores[len(true_refs):]) rougeL_incorrect = np.nanmax(rougeL_scores[len(true_refs) :])
rougeL_max = rougeL_correct rougeL_max = rougeL_correct
rougeL_diff = rougeL_correct - rougeL_incorrect rougeL_diff = rougeL_correct - rougeL_incorrect
rougeL_acc = int(rougeL_correct > rougeL_incorrect) rougeL_acc = int(rougeL_correct > rougeL_incorrect)
...@@ -299,19 +315,15 @@ class TruthfulQAGeneration(Task): ...@@ -299,19 +315,15 @@ class TruthfulQAGeneration(Task):
"bleurt_max": bleurt_max, "bleurt_max": bleurt_max,
"bleurt_acc": bleurt_acc, "bleurt_acc": bleurt_acc,
"bleurt_diff": bleurt_diff, "bleurt_diff": bleurt_diff,
"bleu_max": bleu_max, "bleu_max": bleu_max,
"bleu_acc": bleu_acc, "bleu_acc": bleu_acc,
"bleu_diff": bleu_diff, "bleu_diff": bleu_diff,
"rouge1_max": rouge1_max, "rouge1_max": rouge1_max,
"rouge1_acc": rouge1_acc, "rouge1_acc": rouge1_acc,
"rouge1_diff": rouge1_diff, "rouge1_diff": rouge1_diff,
"rouge2_max": rouge2_max, "rouge2_max": rouge2_max,
"rouge2_acc": rouge2_acc, "rouge2_acc": rouge2_acc,
"rouge2_diff": rouge2_diff, "rouge2_diff": rouge2_diff,
"rougeL_max": rougeL_max, "rougeL_max": rougeL_max,
"rougeL_acc": rougeL_acc, "rougeL_acc": rougeL_acc,
"rougeL_diff": rougeL_diff, "rougeL_diff": rougeL_diff,
...@@ -322,19 +334,15 @@ class TruthfulQAGeneration(Task): ...@@ -322,19 +334,15 @@ class TruthfulQAGeneration(Task):
"bleurt_max": mean, "bleurt_max": mean,
"bleurt_acc": mean, "bleurt_acc": mean,
"bleurt_diff": mean, "bleurt_diff": mean,
"bleu_max": mean, "bleu_max": mean,
"bleu_acc": mean, "bleu_acc": mean,
"bleu_diff": mean, "bleu_diff": mean,
"rouge1_max": mean, "rouge1_max": mean,
"rouge1_acc": mean, "rouge1_acc": mean,
"rouge1_diff": mean, "rouge1_diff": mean,
"rouge2_max": mean, "rouge2_max": mean,
"rouge2_acc": mean, "rouge2_acc": mean,
"rouge2_diff": mean, "rouge2_diff": mean,
"rougeL_max": mean, "rougeL_max": mean,
"rougeL_acc": mean, "rougeL_acc": mean,
"rougeL_diff": mean, "rougeL_diff": mean,
...@@ -345,19 +353,15 @@ class TruthfulQAGeneration(Task): ...@@ -345,19 +353,15 @@ class TruthfulQAGeneration(Task):
"bleurt_max": True, "bleurt_max": True,
"bleurt_acc": True, "bleurt_acc": True,
"bleurt_diff": True, "bleurt_diff": True,
"bleu_max": True, "bleu_max": True,
"bleu_acc": True, "bleu_acc": True,
"bleu_diff": True, "bleu_diff": True,
"rouge1_max": True, "rouge1_max": True,
"rouge1_acc": True, "rouge1_acc": True,
"rouge1_diff": True, "rouge1_diff": True,
"rouge2_max": True, "rouge2_max": True,
"rouge2_acc": True, "rouge2_acc": True,
"rouge2_diff": True, "rouge2_diff": True,
"rougeL_max": True, "rougeL_max": True,
"rougeL_acc": True, "rougeL_acc": True,
"rougeL_diff": True, "rougeL_diff": True,
...@@ -381,7 +385,7 @@ class TruthfulQAGeneration(Task): ...@@ -381,7 +385,7 @@ class TruthfulQAGeneration(Task):
force=False, force=False,
lowercase=False, lowercase=False,
tokenize="intl", tokenize="intl",
use_effective_order=False use_effective_order=False,
).score ).score
return score return score
...@@ -398,9 +402,11 @@ class TruthfulQAGeneration(Task): ...@@ -398,9 +402,11 @@ class TruthfulQAGeneration(Task):
rouge_types = ["rouge1", "rouge2", "rougeLsum"] rouge_types = ["rouge1", "rouge2", "rougeLsum"]
scorer = rouge_scorer.RougeScorer(rouge_types) scorer = rouge_scorer.RougeScorer(rouge_types)
# Add newlines between sentences to correctly compute `rougeLsum`. # Add newlines between sentences to correctly compute `rougeLsum`.
def _prepare_summary(summary): def _prepare_summary(summary):
summary = summary.replace(" . ", ".\n") summary = summary.replace(" . ", ".\n")
return summary return summary
# Accumulate confidence intervals. # Accumulate confidence intervals.
aggregator = scoring.BootstrapAggregator() aggregator = scoring.BootstrapAggregator()
for ref, pred in zip(refs, preds): for ref, pred in zip(refs, preds):
...@@ -408,4 +414,4 @@ class TruthfulQAGeneration(Task): ...@@ -408,4 +414,4 @@ class TruthfulQAGeneration(Task):
pred = _prepare_summary(pred) pred = _prepare_summary(pred)
aggregator.add_scores(scorer.score(ref, pred)) aggregator.add_scores(scorer.score(ref, pred))
result = aggregator.aggregate() result = aggregator.aggregate()
return {type: result[type].mid.fmeasure*100 for type in rouge_types} return {type: result[type].mid.fmeasure * 100 for type in rouge_types}
...@@ -49,29 +49,29 @@ class WordUnscrambleTask(Task): ...@@ -49,29 +49,29 @@ class WordUnscrambleTask(Task):
def doc_to_text(self, doc): def doc_to_text(self, doc):
return doc["context"] return doc["context"]
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return doc["context"]
def doc_to_target(self, doc): def doc_to_target(self, doc):
return doc["completion"] return doc["completion"]
def construct_requests(self, doc, ctx): def construct_requests(self, doc, ctx):
completion = rf.greedy_until(ctx, ["\n"]) completion = rf.greedy_until(ctx, {'until': ["\n"]})
return completion return completion
def process_results(self, doc, results): def process_results(self, doc, results):
pred = results[0] pred = results[0]
gold = doc["completion"] gold = doc["completion"]
return { return {"acc": int(pred == gold)}
"acc": int(pred == gold)
}
def aggregation(self): def aggregation(self):
return { return {"acc": mean}
"acc": mean
}
def higher_is_better(self): def higher_is_better(self):
return { return {"acc": True}
"acc": True
}
class Anagrams1(WordUnscrambleTask): class Anagrams1(WordUnscrambleTask):
......
...@@ -54,14 +54,20 @@ class WebQs(Task): ...@@ -54,14 +54,20 @@ class WebQs(Task):
return self.dataset["test"] return self.dataset["test"]
def doc_to_text(self, doc): def doc_to_text(self, doc):
return "Question: " + doc['question'] + '\nAnswer:' return "Question: " + doc["question"] + "\nAnswer:"
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return doc["question"]
def doc_to_target(self, doc): def doc_to_target(self, doc):
# this picks one answer to be the "correct" one, despite sometimes # this picks one answer to be the "correct" one, despite sometimes
# multiple correct answers being possible. # multiple correct answers being possible.
# TODO: make sure we're actually handling multi-answer correctly # TODO: make sure we're actually handling multi-answer correctly
return " " + doc['answers'][0] return " " + doc["answers"][0]
def _remove_prefixes(self, aliases): def _remove_prefixes(self, aliases):
# Optimization: Remove any alias that has a strict prefix elsewhere in the list # Optimization: Remove any alias that has a strict prefix elsewhere in the list
# we can do this because if the prefix is acceptable by isgreedy, we can stop looking # we can do this because if the prefix is acceptable by isgreedy, we can stop looking
...@@ -75,15 +81,13 @@ class WebQs(Task): ...@@ -75,15 +81,13 @@ class WebQs(Task):
def construct_requests(self, doc, ctx): def construct_requests(self, doc, ctx):
ret = [] ret = []
for alias in self._remove_prefixes(doc['answers']): for alias in self._remove_prefixes(doc["answers"]):
_, is_prediction = rf.loglikelihood(ctx, " " + alias) _, is_prediction = rf.loglikelihood(ctx, " " + alias)
ret.append(is_prediction) ret.append(is_prediction)
return ret return ret
def process_results(self, doc, results): def process_results(self, doc, results):
return { return {"acc": float(any(results))}
"acc": float(any(results))
}
def aggregation(self): def aggregation(self):
return { return {
...@@ -91,6 +95,4 @@ class WebQs(Task): ...@@ -91,6 +95,4 @@ class WebQs(Task):
} }
def higher_is_better(self): def higher_is_better(self):
return { return {"acc": True}
"acc": True
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment