gaoqiong / lm-evaluation-harness · Commits

Commit cf5823cf (unverified)
Authored Jun 21, 2021 by sdtblck; committed via GitHub on Jun 21, 2021
Parents: 11f0e6d8, 6fa1a4fa

    Merge branch 'master' into lambada-multilingual

9 changed files with 283 additions and 1 deletion (+283, -1)
lm_eval/tasks/__init__.py                      +8    -1
lm_eval/tasks/mc_taco.py                       +132  -0
lm_eval/tasks/mutual.py                        +137  -0
tests/testdata/mc_taco-v0-loglikelihood        +1    -0
tests/testdata/mc_taco-v0-res.json             +1    -0
tests/testdata/mutual-v0-loglikelihood         +1    -0
tests/testdata/mutual-v0-res.json              +1    -0
tests/testdata/mutual_plus-v0-loglikelihood    +1    -0
tests/testdata/mutual_plus-v0-res.json         +1    -0
lm_eval/tasks/__init__.py
@@ -22,6 +22,7 @@ from . import lambada
 from . import race
 from . import piqa
 from . import prost
+from . import mc_taco
 from . import triviaqa
 from . import pubmedqa
 from . import sciq
@@ -41,6 +42,7 @@ from . import lambada_cloze
 from . import pile
 from . import wikitext
 from . import lambada_multilingual
+from . import mutual

 ########################################
 # Translation tasks
@@ -99,7 +101,7 @@ TASK_REGISTRY = {
     "lambada": lambada.LAMBADA,
     "lambada_cloze": lambada_cloze.LAMBADA_cloze,
-    # multlingual lambada
+    # multilingual lambada
     **lambada_multilingual.construct_tasks(),
     "wikitext": wikitext.WikiText,
@@ -108,6 +110,7 @@ TASK_REGISTRY = {
     "piqa": piqa.PiQA,
     "prost": prost.PROST,
+    "mc_taco": mc_taco.MCTACO,
     # Science related
     "pubmedqa": pubmedqa.Pubmed_QA,
@@ -144,6 +147,10 @@ TASK_REGISTRY = {
     "ethics_utilitarianism": hendrycks_ethics.EthicsUtilitarianism,
     "ethics_virtue": hendrycks_ethics.EthicsVirtue,
+    # dialogue
+    "mutual": mutual.MuTual,
+    "mutual_plus": mutual.MuTualPlus,
     # math
     "math_algebra": hendrycks_math.MathAlgebra,
     "math_counting_and_prob": hendrycks_math.MathCountingAndProbability,
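Every name added to TASK_REGISTRY above becomes addressable from the evaluation entry points. As a minimal sketch (assuming the harness's get_task helper, which resolves names through this registry), the new tasks can be looked up and instantiated like so:

    from lm_eval import tasks

    mc_taco_cls = tasks.get_task("mc_taco")  # -> lm_eval.tasks.mc_taco.MCTACO
    mutual_cls = tasks.get_task("mutual")    # -> lm_eval.tasks.mutual.MuTual

    task = mutual_cls()  # instantiating runs download() and exposes the task's docs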
lm_eval/tasks/mc_taco.py  (new file, mode 100644)
"""
“Going on a vacation” takes longer than “Going for a walk”:
A Study of Temporal Commonsense Understanding
https://arxiv.org/pdf/1909.03065.pdf
WARNING: Running this task with a `--limit` arg will give misleading results! The
corresponding dataset is structured such that each multiple-choice question gathered
by the authors is split into question-option pairs, where each such pair gets
siloed into an individual document for plausibility testing. Because the harness
shuffles these documents, setting `--limit` will likely "cut off" certain candidate
answers. This is a problem because the task's metrics require an exhaustive evaluation
of a question's options. See section 4 of the paper for details.
@inproceedings{ZKNR19,
author = {Ben Zhou, Daniel Khashabi, Qiang Ning and Dan Roth},
title = {“Going on a vacation” takes longer than “Going for a walk”: A Study of Temporal Commonsense Understanding },
booktitle = {EMNLP},
year = {2019},
}
"""
import numpy as np
from lm_eval.base import rf
from collections import defaultdict
from .common import HFTask
class MCTACO(HFTask):
    VERSION = 0
    DATASET_PATH = "mc_taco"
    DATASET_NAME = None

    def has_training_docs(self):
        return False

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def fewshot_description(self):
        return "Determine whether the candidate answer is plausible (\"yes\") or not (\"no\")"

    def doc_to_text(self, doc):
        return f"{doc['sentence']}\nQuestion: {doc['question']}\n" \
               f"Answer: {doc['answer']}\nPlausible:"

    def doc_to_target(self, doc):
        return " " + ["no", "yes"][doc['label']]

    def construct_requests(self, doc, ctx):
        """Uses RequestFactory to construct Requests and returns an iterable of
        Requests which will be sent to the LM.

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param ctx: str
            The context string, generated by fewshot_context. This includes the natural
            language description, as well as the few shot examples, and the question
            part of the document for `doc`.
        """
        ll_no, _ = rf.loglikelihood(ctx, " no")
        ll_yes, _ = rf.loglikelihood(ctx, " yes")
        return ll_no, ll_yes

    def process_results(self, doc, results):
        """Takes a single document and the LM results and evaluates, returning a
        dict where keys are the names of submetrics and values are the values of
        the metric for that one document.

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param results:
            The results of the requests created in construct_requests.
        """
        ll_no, ll_yes = results
        gold = doc['label']
        pred = int(ll_yes > ll_no)
        question_id = self._question2id(doc)
        items = (gold, pred, question_id)
        return {"em": items, "f1": items}

    def _question2id(self, doc):
        """Returns an identifier for the question in the given document."""
        return " ".join([doc['sentence'], doc['question']])

    def aggregation(self):
        return {
            "f1": f1,
            "em": exact_match,
        }

    def higher_is_better(self):
        return {
            "f1": True,
            "em": True,
        }
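# Editorial sketch (hypothetical doc, not drawn from the dataset): doc_to_text /
# doc_to_target above render each question-option pair as
#
#   He went on vacation.
#   Question: How long did it take?
#   Answer: two weeks
#   Plausible:
#
# with target " yes" (label 1) or " no" (label 0), matching the " no" / " yes"
# continuations scored in construct_requests.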
def exact_match(items):
    """
    Counts a question as correct if the model accurately classifies the plausibility
    of an answer for all candidate answers. See section 4 "Evaluation Metrics" in the paper.
    """
    results = list(zip(*items))
    accuracies = defaultdict(list)
    for gold, pred, question in zip(results[0], results[1], results[2]):
        accuracies[question].append(pred == gold)
    return np.mean([int(all(accs)) for accs in accuracies.values()])
def f1(items):
    """See section 4 "Evaluation Metrics" in the paper about the F1 metric used."""
    results = list(zip(*items))
    # Group the positive ("yes" = 1) golds and predictions by question.
    gold_positives, pred_positives = defaultdict(list), defaultdict(list)
    for gold, pred, question in zip(results[0], results[1], results[2]):
        gold_positives[question].append(gold)
        pred_positives[question].append(pred)
    f1 = []
    for question in gold_positives.keys():
        gp, pp = sum(gold_positives[question]), sum(pred_positives[question])
        tp = sum(np.logical_and(gold_positives[question], pred_positives[question]))
        p = tp / pp if pp > 0. else 1.
        r = tp / gp if gp > 0. else 1.
        if p + r > 0.:
            f1.append(2. * (p * r) / (p + r))
    return np.mean(f1)
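# Editorial sketch: behaviour of the metrics above on toy (gold, pred, question_id)
# items. Question "q1" has one misclassified option, so it counts as wrong under
# exact match; "q2" is fully correct.
#
#   _toy = [(1, 1, "q1"), (0, 1, "q1"), (1, 1, "q2"), (0, 0, "q2")]
#   exact_match(_toy)  # -> 0.5  (only q2 is fully correct)
#   f1(_toy)           # -> ~0.83 (per-question F1: 2/3 for q1, 1.0 for q2)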
lm_eval/tasks/mutual.py  (new file, mode 100644)
"""
MuTual: A Dataset for Multi-Turn Dialogue Reasoning
https://www.aclweb.org/anthology/2020.acl-main.130/
@inproceedings{mutual,
title = "MuTual: A Dataset for Multi-Turn Dialogue Reasoning",
author = "Cui, Leyang and Wu, Yu and Liu, Shujie and Zhang, Yue and Zhou, Ming" ,
booktitle = "Proceedings of the 58th Conference of the Association for Computational Linguistics",
year = "2020",
publisher = "Association for Computational Linguistics",
}
"""
import json
import zipfile
import shutil
import numpy as np
from pathlib import Path
from lm_eval.base import Task, rf
from lm_eval.metrics import mean
from best_download import download_file
class MuTualBase(Task):
    VERSION = 0
    BASE_PATH = Path("data/mutual")
    DATASET_NAME = None
    CHOICES = ['A', 'B', 'C', 'D']

    def __init__(self):
        super().__init__()
    def download(self):
        if self.BASE_PATH.exists():
            return
        Path.mkdir(self.BASE_PATH, parents=True)
        master_zip = Path("data/master.zip")
        download_file(
            "https://github.com/Nealcly/MuTual/archive/master.zip",
            str(master_zip),
            "bb325cf6c672f0f02699993a37138b0fa0af6fcfc77ec81dfbe46add4d7b29f9")
        with zipfile.ZipFile(master_zip, 'r') as zip:
            zip.extractall("data")
        Path("data/MuTual-master/data").rename(str(self.BASE_PATH))
        # Remove left over files and directories.
        master_zip.unlink()
        shutil.rmtree("data/MuTual-master")
    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def _load_docs(self, path):
        for file in path.iterdir():
            if file.suffix != ".txt":
                continue
            with open(file, 'r', encoding='utf-8') as f:
                yield json.load(f)

    def training_docs(self):
        return self._load_docs(self.BASE_PATH / self.DATASET_NAME / "train")

    def validation_docs(self):
        return self._load_docs(self.BASE_PATH / self.DATASET_NAME / "dev")

    def test_docs(self):
        return NotImplemented
    def fewshot_description(self):
        # TODO: figure out fewshot description
        return ""

    def doc_to_text(self, doc):
        return self.detokenize(doc["article"])

    def doc_to_target(self, doc):
        return " " + self.detokenize(doc["options"][self.CHOICES.index(doc["answers"])])
    def construct_requests(self, doc, ctx):
        lls = []
        for option in doc["options"]:
            # Keep only the loglikelihood (index 0) and drop the is_greedy flag,
            # so that process_results receives plain scalar scores to rank.
            lls.append(rf.loglikelihood(ctx, f" {self.detokenize(option)}")[0])
        return lls
    def detokenize(self, text):
        text = text.replace(" '", "'")
        text = text.replace(" \n", "\n")
        text = text.replace("\n ", "\n")
        text = text.replace(" n't", "n't")
        text = text.replace("`` ", '"')
        text = text.replace("''", '"')
        # punctuation
        text = text.replace(" :", ":")
        text = text.replace(" ;", ";")
        text = text.replace(" !", "!")
        text = text.replace(" ?", "?")
        text = text.replace(" ,", ",")
        text = text.replace(" .", ".")
        return text
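    # Example: detokenize maps the corpus' PTB-style tokenization back to natural
    # text, e.g. "m : are n't you hungry ?" -> "m: aren't you hungry?".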
    def process_results(self, doc, results):
        gold = self.CHOICES.index(doc["answers"])
        r4_1 = np.argmax(results) == gold  # r4_1 = accuracy
        ranks = sorted(results, reverse=True)
        r4_2 = (ranks.index(results[gold]) == 1) + r4_1
        mrr = 1. / (ranks.index(results[gold]) + 1)  # `+ 1` for index offset
        return {"r@1": r4_1, "r@2": r4_2, "mrr": mrr}
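    # Worked example (hypothetical loglikelihoods): with results =
    # [-0.3, -0.9, -2.0, -1.2] and gold answer "B" (index 1, score -0.9),
    # argmax picks index 0, so r@1 is False; the gold option ranks second,
    # so r@2 is True; and mrr = 1 / (1 + 1) = 0.5.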
    def aggregation(self):
        return {"r@1": mean, "r@2": mean, "mrr": mean}

    def higher_is_better(self):
        return {"r@1": True, "r@2": True, "mrr": True}
class MuTual(MuTualBase):
    DATASET_NAME = Path("mutual")


class MuTualPlus(MuTualBase):
    DATASET_NAME = Path("mutual_plus")
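With MuTual and MuTualPlus registered in TASK_REGISTRY, the new tasks run through the harness's usual entry point. A hedged invocation sketch (flags as documented in this repository's README; the model name is only an example):

    python main.py --model gpt2 --tasks mutual,mutual_plus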
tests/testdata/mc_taco-v0-loglikelihood  (new file, mode 100644)

1811808ef05afd5f30ffc3471622a3dd7a1b681b17a2f7616695ad6b2a45943c
(no newline at end of file)
tests/testdata/mc_taco-v0-res.json  (new file, mode 100644)

{"results": {"mc_taco": {"em": 0.07732732732732733, "f1": 0.41600515965511614}}, "versions": {"mc_taco": 0}}
(no newline at end of file)
tests/testdata/mutual-v0-loglikelihood  (new file, mode 100644)

f759213a28f0412510bf1a24c9cab0dae64bdee902d42a26225295445e7779db
(no newline at end of file)
tests/testdata/mutual-v0-res.json  (new file, mode 100644)

{"results": {"mutual": {"mrr": 0.5023513920240772, "mrr_stderr": 0.009501864812936679, "r@1": 0.22573363431151242, "r@1_stderr": 0.014053085820407457, "r@2": 0.4221218961625282, "r@2_stderr": 0.016602191705517556}}, "versions": {"mutual": 0}}
(no newline at end of file)
tests/testdata/mutual_plus-v0-loglikelihood  (new file, mode 100644)

b846bb9db109535f59a93d1ce340cf09f68bdf4fed5b8decd168784220fe07fa
(no newline at end of file)
tests/testdata/mutual_plus-v0-res.json  (new file, mode 100644)

{"results": {"mutual_plus": {"mrr": 0.5275583145221953, "mrr_stderr": 0.009940894824430708, "r@1": 0.2595936794582393, "r@1_stderr": 0.014737047402750955, "r@2": 0.45372460496614, "r@2_stderr": 0.01673517854461967}}, "versions": {"mutual_plus": 0}}
(no newline at end of file)