Unverified Commit 6fa1a4fa authored by Leo Gao, committed by GitHub

Merge pull request #201 from jon-tow/mc-taco

Implement `MC-TACO`
parents 3e1bcf71 9676b93f
@@ -22,6 +22,7 @@ from . import lambada
 from . import race
 from . import piqa
 from . import prost
+from . import mc_taco
 from . import triviaqa
 from . import pubmedqa
 from . import sciq
@@ -104,6 +105,7 @@ TASK_REGISTRY = {
     "piqa": piqa.PiQA,
     "prost": prost.PROST,
+    "mc_taco": mc_taco.MCTACO,
     # Science related
     "pubmedqa" : pubmedqa.Pubmed_QA,
"""
“Going on a vacation” takes longer than “Going for a walk”:
A Study of Temporal Commonsense Understanding
https://arxiv.org/pdf/1909.03065.pdf
WARNING: Running this task with a `--limit` arg will give misleading results! The
corresponding dataset is structured such that each multiple-choice-question gathered
by the authors is split into question-option pairs, where each such pair gets
siloed into an individual document for plausibility testing. Because the harness
shuffles these documents, setting `--limit` will likely "cut off" certain candidate
answers. This is a problem because the task's metrics require an exhaustive evaluation
of a question's options. See section 4 of the paper for details.
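For example, a single question with four candidate answers is stored as four
separate documents (one per question-answer pair); all four must be scored
together for the per-question EM and F1 metrics to be meaningful.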
@inproceedings{ZKNR19,
author = {Ben Zhou, Daniel Khashabi, Qiang Ning and Dan Roth},
title = {“Going on a vacation” takes longer than “Going for a walk”: A Study of Temporal Commonsense Understanding },
booktitle = {EMNLP},
year = {2019},
}
"""
import numpy as np
from lm_eval.base import rf
from collections import defaultdict
from . common import HFTask
class MCTACO(HFTask):
VERSION = 0
DATASET_PATH = "mc_taco"
DATASET_NAME = None
def has_training_docs(self):
return False
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def fewshot_description(self):
return "Determine whether the candidate answer is plausible (\"yes\") or not (\"no\")"
def doc_to_text(self, doc):
return f"{doc['sentence']}\nQuestion: {doc['question']}\n"\
f"Answer: {doc['answer']}\nPlausible:"
def doc_to_target(self, doc):
return " " + ["no", "yes"][doc['label']]
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
ll_no, _ = rf.loglikelihood(ctx, " no")
ll_yes, _ = rf.loglikelihood(ctx, " yes")
return ll_no, ll_yes
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param results:
The results of the requests created in construct_requests.
"""
ll_no, ll_yes = results
gold = doc['label']
pred = int(ll_yes > ll_no)
question_id = self._question2id(doc)
items = (gold, pred, question_id)
return {
"em": items,
"f1": items
}
def _question2id(self, doc):
""" Returns an identifier for the question in the given document. """
return " ".join([doc['sentence'], doc['question']])
def aggregation(self):
return {
"f1": f1,
"em": exact_match,
}
def higher_is_better(self):
return {
"f1": True,
"em": True,
}
def exact_match(items):
"""
Counts a question as correct if the model accurately classifies the plausibility
of an answer for all candidate answers. See section 4 "Evaluation Metrics" in the paper.
"""
results = list(zip(*items))
accuracies = defaultdict(list)
for gold, pred, question in zip(results[0], results[1], results[2]):
accuracies[question].append(pred == gold)
return np.mean([int(all(accs)) for accs in accuracies.values()])
def f1(items):
""" See section 4 "Evaluation Metrics" in the paper about the F1 metric used. """
results = list(zip(*items))
# Group the positive ("yes" = 1) golds and predictions by question.
gold_positives, pred_positives = defaultdict(list), defaultdict(list)
for gold, pred, question in zip(results[0], results[1], results[2]):
gold_positives[question].append(gold)
pred_positives[question].append(pred)
f1 = []
for question in gold_positives.keys():
gp, pp = sum(gold_positives[question]), sum(pred_positives[question])
tp = sum(np.logical_and(gold_positives[question], pred_positives[question]))
p = tp / pp if pp > 0.0 else 1.0
r = tp / gp if gp > 0.0 else 1.0
if p + r > 0.0:
f1.append(2. * (p * r) / (p + r))
return np.mean(f1)
1811808ef05afd5f30ffc3471622a3dd7a1b681b17a2f7616695ad6b2a45943c
\ No newline at end of file
{"results": {"mc_taco": {"em": 0.07732732732732733, "f1": 0.41600515965511614}}, "versions": {"mc_taco": 0}}
\ No newline at end of file