mc_taco.py

"""
“Going on a vacation” takes longer than “Going for a walk”:
A Study of Temporal Commonsense Understanding
https://arxiv.org/pdf/1909.03065.pdf

WARNING: Running this task with a `--limit` arg will give misleading results! The 
corresponding dataset is structured such that each multiple-choice-question gathered
by the authors is split into question-option pairs, where each such pair gets 
siloed into an individual document for plausibility testing. Because the harness
shuffles these documents, setting `--limit` will likely "cut off" certain candidate
answers. This is a problem because the task metrics require an exhaustive evaluation 
of a question's options (see section 4 of the paper for details).

@inproceedings{ZKNR19,
    author = {Ben Zhou, Daniel Khashabi, Qiang Ning and Dan Roth},
    title = {“Going on a vacation” takes longer than “Going for a walk”: A Study of Temporal Commonsense Understanding },
    booktitle = {EMNLP},
    year = {2019},
}
"""

import numpy as np
from lm_eval.base import rf
from collections import defaultdict
from . common import HFTask


class MCTACO(HFTask):
    """MC-TACO plausibility task.

    Every document pairs a sentence and a question with one candidate answer
    and a binary label; the model is asked whether that candidate is plausible
    by comparing the log-likelihoods of the continuations " yes" and " no".
    """

    VERSION = 0
    DATASET_PATH = "mc_taco"
    DATASET_NAME = None

    def has_training_docs(self):
        """Report that this task provides no training documents."""
        return False

    def has_validation_docs(self):
        """Report that this task provides validation documents."""
        return True

    def has_test_docs(self):
        """Report that this task provides test documents."""
        return True

    def fewshot_description(self):
        """Return the natural-language instruction describing the task."""
        return 'Determine whether the candidate answer is plausible ("yes") or not ("no")'

    def doc_to_text(self, doc):
        """Render the prompt for `doc`: sentence, question, candidate answer,
        and a trailing "Plausible:" cue, one per line.
        """
        prompt_lines = [
            f"{doc['sentence']}",
            f"Question: {doc['question']}",
            f"Answer: {doc['answer']}",
            "Plausible:",
        ]
        return "\n".join(prompt_lines)

    def doc_to_target(self, doc):
        """Return the gold continuation for `doc`: " no" for label 0 and
        " yes" for label 1.
        """
        verdicts = ["no", "yes"]
        return f" {verdicts[doc['label']]}"

    def construct_requests(self, doc, ctx):
        """Build the two log-likelihood requests scored for one document.

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param ctx: str
            The context string the continuations are appended to.
        :return:
            A pair of requests, first for the " no" continuation and then for
            the " yes" continuation.
        """
        # Only the first element of each loglikelihood pair is kept; the
        # second element is discarded.
        no_request, _ = rf.loglikelihood(ctx, " no")
        yes_request, _ = rf.loglikelihood(ctx, " yes")
        return no_request, yes_request

    def process_results(self, doc, results):
        """Turn the LM outputs for one document into per-metric items.

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param results:
            The results of the requests created in construct_requests, in the
            order (" no" log-likelihood, " yes" log-likelihood).
        :return:
            A dict mapping "em" and "f1" to the same
            `(gold label, predicted label, question id)` tuple.
        """
        ll_no, ll_yes = results
        gold = doc['label']
        # Predict "yes" (1) only when it is strictly more likely than "no".
        pred = 1 if ll_yes > ll_no else 0
        record = (gold, pred, self._question2id(doc))
        return {"em": record, "f1": record}

    def _question2id(self, doc):
        """Build the key used to group documents by question: the sentence and
        the question joined with a single space.
        """
        return " ".join((doc['sentence'], doc['question']))

    def aggregation(self):
        """Map each metric name to the function that aggregates its items."""
        return {"f1": f1, "em": exact_match}

    def higher_is_better(self):
        """Declare that larger values are better for both metrics."""
        return {"f1": True, "em": True}


def exact_match(items):
    """
    Fraction of questions for which every candidate answer was classified
    correctly. See section 4 "Evaluation Metrics" in the paper.

    `items` is a sequence of `(gold, pred, question)` tuples; tuples sharing
    the same `question` value are scored together as one question.
    """
    columns = list(zip(*items))
    golds, preds, questions = columns[0], columns[1], columns[2]
    # Running "all correct so far" flag per question, in first-seen order.
    fully_correct = {}
    for question, gold, pred in zip(questions, golds, preds):
        fully_correct[question] = fully_correct.get(question, True) and (pred == gold)
    return np.mean([int(flag) for flag in fully_correct.values()])


def f1(items):
    """Return the per-question F1 over positive ("yes") answers, averaged over
    all questions. See section 4 "Evaluation Metrics" in the paper.

    :param items:
        Iterable of `(gold, pred, question)` tuples, where `gold` and `pred`
        are 0/1 labels and `question` is the key used to group candidate
        answers belonging to the same question.
    :return:
        The mean of the per-question F1 scores. Precision defaults to 1.0 when
        nothing was predicted positive, and recall defaults to 1.0 when there
        is no positive gold answer. A question whose precision and recall are
        both 0 contributes an F1 of 0.0 to the average (it is not skipped, as
        skipping would shrink the denominator and inflate the score). An empty
        `items` yields NaN, which is NumPy's mean of an empty list.
    """
    # Group the gold labels and predictions by question.
    golds_by_question = defaultdict(list)
    preds_by_question = defaultdict(list)
    for gold, pred, question in items:
        golds_by_question[question].append(gold)
        preds_by_question[question].append(pred)

    scores = []
    for question, golds in golds_by_question.items():
        preds = preds_by_question[question]
        gold_positives = sum(golds)
        pred_positives = sum(preds)
        true_positives = sum(1 for g, p in zip(golds, preds) if g and p)
        precision = true_positives / pred_positives if pred_positives > 0 else 1.0
        recall = true_positives / gold_positives if gold_positives > 0 else 1.0
        total = precision + recall
        # Guard the division only; a zero-score question still counts as 0.0.
        scores.append(2.0 * precision * recall / total if total > 0.0 else 0.0)
    return np.mean(scores)