Commit fc7cd630 authored by Jon Tow

Implement `DROP` evaluation

parent f3bf1c07
@@ -29,6 +29,7 @@ from . import qa4mre
from . import translation
from . import headqa
from . import mathqa
from . import drop
########################################
# Translation tasks
@@ -83,6 +84,7 @@ TASK_REGISTRY = {
# Order by benchmark/genre?
"coqa": coqa.CoQA,
"drop": drop.DROP,
"lambada": lambada.LAMBADA,
"piqa": piqa.PiQA,
import json
import re
from pathlib import Path
from zipfile import ZipFile

import numpy as np
import transformers.data.metrics.squad_metrics as squad_metrics
from best_download import download_file
from scipy.optimize import linear_sum_assignment

from lm_eval.base import Task, rf
from lm_eval.metrics import mean
class DROP(Task):
    DATAFOLDER = Path("data/drop")
URL = "https://s3-us-west-2.amazonaws.com/allennlp/datasets/drop/drop_dataset.zip"
def download(self):
if self.DATAFOLDER.exists():
return
        self.DATAFOLDER.mkdir(parents=True)
        download_file(self.URL, to=str(self.DATAFOLDER / "drop_dataset.zip"))
        with ZipFile(self.DATAFOLDER / "drop_dataset.zip", "r") as zf:
            zf.extractall(self.DATAFOLDER)
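        # NOTE: the archive unpacks into a `drop_dataset/` folder containing
        # drop_dataset_train.json and drop_dataset_dev.json (see the loaders below).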
def has_training_docs(self):
"""Whether the task has a training set"""
return True
def has_validation_docs(self):
"""Whether the task has a validation set"""
return True
def has_test_docs(self):
"""Whether the task has a test set"""
return False
def _load_docs(self, docs):
for doc in docs:
for qa in doc["qa_pairs"]:
yield {
"passage": doc["passage"],
"question": qa["question"],
"answers": self.get_answers(qa["answer"]),
}
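    # Each raw JSON entry holds one passage and its `qa_pairs`; `_load_docs`
    # flattens this into one doc per question.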
@classmethod
def get_answers(cls, answers):
# NOTE: We wrap every non-`list` answer into a list for uniformity.
if answers["number"] != "":
return [answers["number"]]
if answers["spans"] != []:
return answers["spans"]
return [" ".join([answers["date"]["day"],
answers["date"]["month"],
answers["date"]["year"]]).strip()]
def training_docs(self):
docs = json.load(open(self.DATAFOLDER / "drop_dataset" / "drop_dataset_train.json"))
return self._load_docs([docs[k] for k in docs.keys()])
    def validation_docs(self):
        docs = json.load(open(self.DATAFOLDER / "drop_dataset" / "drop_dataset_dev.json"))
        return self._load_docs([docs[k] for k in docs.keys()])
def test_docs(self):
pass
def fewshot_description(self):
# TODO: figure out description
return ""
def doc_to_text(self, doc):
return f"Passage: {doc['passage']}\nQuestion: {doc['question']}\nAnswer:"
def doc_to_target(self, doc):
return " " + ", ".join(doc["answers"])
    def construct_requests(self, doc, ctx):
        """Uses RequestFactory to construct Requests and returns an iterable of
        Requests which will be sent to the LM.

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param ctx: str
            The context string, generated by fewshot_context. This includes the natural
            language description, as well as the few shot examples, and the question
            part of the document for `doc`.
        """
        # One greedy continuation is requested per gold answer; generation stops
        # at a newline or a period.
        conts = []
        for _ in doc["answers"]:
            conts.append(rf.greedy_until(ctx, ["\n", "."]))
        return conts
    def process_results(self, doc, results):
        """Take a single document and the LM results and evaluates, returning a
        dict where keys are the names of submetrics and values are the values of
        the metric for that one document

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param results:
            The results of the requests created in construct_requests.
        """
        gold, pred = doc["answers"], results
        exact_match = self._exact_match(gold, pred)
        f1_score = self._f1_score(gold, pred)
        return {"em": exact_match, "f1": f1_score}
def _exact_match(self, golds, preds):
""" Returns the exact match of normalized gold answers and predictions. """
normalized_golds = set([self._normalize(gold) for gold in golds])
normalized_preds = set([self._normalize(pred) for pred in preds])
return int(normalized_golds == normalized_preds)
    def _f1_score(self, golds, preds):
        """Returns the average F1-score over normalized `gold` and `pred`
        answer lists.
        """
        gold_bags = self._answer_to_bags(golds)
        pred_bags = self._answer_to_bags(preds)
        f1_per_bag = self._align_bags(gold_bags, pred_bags)
        return np.mean(f1_per_bag)
def _answer_to_bags(self, answers):
return [set(self._normalize(answer).split()) for answer in answers]
    def _align_bags(self, gold_bags, pred_bags):
        """Returns the max metric value over all the answers."""
        scores = np.zeros([len(gold_bags), len(pred_bags)])
        for gold_index, gold_bag in enumerate(gold_bags):
            for pred_index, pred_bag in enumerate(pred_bags):
                if self._is_number_match(gold_bag, pred_bag):
                    scores[gold_index, pred_index] = self._bag_f1(pred_bag, gold_bag)
        # Find the one-to-one gold/pred assignment that maximizes total F1.
        row_ind, col_ind = linear_sum_assignment(-scores)
        max_scores = np.zeros([max(len(gold_bags), len(pred_bags))])
        for row, column in zip(row_ind, col_ind):
            max_scores[row] = max(max_scores[row], scores[row, column])
        return max_scores
def _bag_f1(self, gold_bag, pred_bag):
intersection = len(gold_bag.intersection(pred_bag))
if intersection == 0:
return 0.0
precision = intersection / float(len(pred_bag)) if pred_bag else 1.0
recall = intersection / float(len(gold_bag)) if gold_bag else 1.0
f1 = (2 * precision * recall) / (precision + recall)
return f1
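    # Worked example for `_bag_f1` (values from the sanity checks below):
    # gold bag {"25", "to", "44"} and pred bag {"25", "to", "44", "or", "45", "64"}
    # share 3 tokens, so precision = 3/6, recall = 3/3, F1 = 2 * 0.5 * 1.0 / 1.5 = 2/3.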
def _is_number_match(self, gold_bag, pred_bag):
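        """Gold bags containing numbers only match predictions that share at least one of those numbers."""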
gold_numbers = set(filter(lambda s: s.isnumeric(), list(gold_bag)))
pred_numbers = set(filter(lambda s: s.isnumeric(), list(pred_bag)))
return (not gold_numbers) or gold_numbers.intersection(pred_numbers)
def _normalize(self, answer):
def tokenize(text):
return re.split(" |-", text)
tokens = [squad_metrics.normalize_answer(token) for token in tokenize(answer)]
normalized = " ".join(tokens).strip()
return normalized
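    # `_normalize` example (hypothetical input): "The Pacific-Ocean" -> "pacific ocean"
    # after hyphen splitting, lower-casing, and article/punctuation removal via
    # squad_metrics.normalize_answer.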
    def aggregation(self):
        """
        :returns: {str: [float] -> float}
            A dictionary where keys are the names of submetrics and values are
            functions that aggregate a list of metrics
        """
        return {"em": mean, "f1": mean}
    def higher_is_better(self):
        """
        :returns: {str: bool}
            A dictionary where keys are the names of submetrics and values are
            whether a higher value of the submetric is better
        """
        return {"em": True, "f1": True}
# Temporary sanity-checks
def main():
drop = DROP()
def test_bags():
multiple_answers = ["Pacific Ocean", "Pacific"]
ma_bags = drop._answer_to_bags(multiple_answers)
print(f"Multiple Choice Answer Bags: {multiple_answers} => {ma_bags}")
assert len(ma_bags) == 2
number_answer = ["1974"]
number_bags = drop._answer_to_bags(number_answer)
print(f"Number Bags: {number_answer} => {number_bags}")
print()
test_bags()
def test_is_number_match():
gold = ["10 29 1999"]
pred = ["4 29 1990"]
gb = drop._answer_to_bags(gold)
pb = drop._answer_to_bags(pred)
print(gb)
print(pb)
for g in gb:
for p in pb:
match = drop._is_number_match(g, p)
print(match)
print()
#test_is_number_match()
def test_exact_match():
gold = ["Bob Ross"]
pred = ["Bob Ross"]
em = drop._exact_match(gold, pred)
print(em)
#test_exact_match()
def test_f1_score():
gold = ["25 to 44"]
pred = ["25 to 44 or 45 to 64"]
f1 = drop._f1_score(gold, pred)
print(gold)
print(pred)
print(f1)
gold = ["300", "1992"]
pred = ["300", "1992"]
f1 = drop._f1_score(gold, pred)
print(f1)
#test_f1_score()
if __name__ == "__main__":
main()