gaoqiong / lm-evaluation-harness · Commits

Commit e4d852a0, authored Feb 21, 2021 by Jon Tow
Clean up
Parent: f4f7618a

Showing 1 changed file with 23 additions and 77 deletions.

lm_eval/tasks/drop.py (+23, -77)
@@ -15,8 +15,7 @@ class DROP(Task):
     URL = "https://s3-us-west-2.amazonaws.com/allennlp/datasets/drop/drop_dataset.zip"
 
     def download(self):
         if self.DATAFOLDER.exists():
             return
         Path.mkdir(self.DATAFOLDER)
         download_file(self.URL, to=str(self.DATAFOLDER / "drop_dataset.zip"))
         with ZipFile(self.DATAFOLDER / "drop_dataset.zip", "r") as zip:
@@ -39,6 +38,7 @@ class DROP(Task):
         for doc in docs:
             for qa in doc["qa_pairs"]:
                 yield {
+                    "id": qa["query_id"],
                     "passage": doc["passage"],
                     "question": qa["question"],
                     "answers": self.get_answers(qa["answer"]),
@@ -48,7 +48,7 @@ class DROP(Task):
     def get_answers(cls, answers):
         # NOTE: We wrap every non-`list` answer into a list for uniformity.
         if answers["number"] != "":
-            return [answers["number"]]
+            return [str(answers["number"])]
         if answers["spans"] != []:
             return answers["spans"]
         return [" ".join([answers["date"]["day"],
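Taken together, `get_answers` flattens the three mutually exclusive DROP answer types (number, spans, date) into a list of strings; the hunk truncates inside the date-joining `return`. A self-contained sketch of that behavior, assuming the date dict carries "day"/"month"/"year" keys (the tail of the `join` is not visible in this diff, and `normalize_drop_answer` is a hypothetical stand-in name):

def normalize_drop_answer(answers):
    # Numbers may arrive as ints or strings; cast and wrap for uniformity.
    if answers["number"] != "":
        return [str(answers["number"])]
    # Span answers are already a list of strings.
    if answers["spans"] != []:
        return answers["spans"]
    # Date answers: join the populated parts into one string.
    # ("month"/"year" keys are assumed; the diff cuts off after "day".)
    parts = [answers["date"].get(k, "") for k in ("day", "month", "year")]
    return [" ".join(p for p in parts if p)]

print(normalize_drop_answer({"number": "", "spans": ["Pacific Ocean"], "date": {}}))
# -> ['Pacific Ocean']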
@@ -76,16 +76,16 @@ class DROP(Task):
         """Uses RequestFactory to construct Requests and returns an iterable of
         Requests which will be sent to the LM.
 
         :param doc:
             The document as returned from training_docs, validation_docs, or test_docs.
         :param ctx: str
             The context string, generated by fewshot_context. This includes the natural
             language description, as well as the few shot examples, and the question
             part of the document for `doc`.
         """
         conts = []
         for _ in doc["answers"]:
-            conts.append(rf.greedy_until(ctx, ["\n", "."]))
+            conts.append(rf.greedy_until(ctx, ["."]))
         return conts
 
     def process_results(self, doc, results):
@@ -94,16 +94,17 @@ class DROP(Task):
         the metric for that one document
 
         :param doc:
             The document as returned from training_docs, validation_docs, or test_docs.
         :param results:
             The results of the requests created in construct_requests.
         """
-        gold, pred = doc["answers"], results
-        print(gold)
-        print(pred)
-        exact_match = self._exact_match(gold, pred)
-        f1_score = self._f1_score(gold, pred)
+        golds, preds = doc["answers"], results
+        exact_match = self._exact_match(golds, preds)
+        f1_score = self._f1_score(golds, preds)
         return {
             "em": exact_match,
             "f1": f1_score
         }
 
     def _exact_match(self, golds, preds):
         """ Returns the exact match of normalized gold answers and predictions. """
@@ -112,13 +113,9 @@ class DROP(Task):
         return int(normalized_golds == normalized_preds)
 
     def _f1_score(self, golds, preds):
-        """Returns the average F1-score over normalized `gold` and `pred`
-        answer lists.
-        """
+        """Returns the average F1-score over normalized gold answers and predictions. """
         gold_bags = self._answer_to_bags(golds)
-        print("GOLD BAGS: " + str(gold_bags))
         pred_bags = self._answer_to_bags(preds)
-        print("PRED BAGS: " + str(pred_bags))
         f1_per_bag = self._align_bags(gold_bags, pred_bags)
         return np.mean(f1_per_bag)
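`_f1_score` turns both answer lists into token bags, matches each gold bag to a predicted bag (`_align_bags`, next hunk), and averages the per-bag scores. The per-bag helper `_bag_f1` never appears in this commit; a plausible sketch, assuming bags are sets of normalized tokens scored with overlap F1 as in the official DROP metric:

def bag_f1(gold_bag, pred_bag):
    # Overlap F1 between one gold token bag and one predicted token bag.
    # Sketch only: assumes bags are Python sets; the file's actual _bag_f1
    # is not visible in this commit.
    overlap = len(gold_bag & pred_bag)
    if overlap == 0:
        return 0.0
    precision = overlap / len(pred_bag)
    recall = overlap / len(gold_bag)
    return 2 * precision * recall / (precision + recall)

print(bag_f1({"25", "to", "44"}, {"25", "to", "44", "or", "45", "64"}))
# -> 0.666... (3 shared tokens, precision 0.5, recall 1.0)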
@@ -133,7 +130,6 @@ class DROP(Task):
                 print(self._is_number_match(gold_bag, pred_bag))
                 if self._is_number_match(gold_bag, pred_bag):
                     scores[gold_index, pred_index] = self._bag_f1(gold_bag, pred_bag)
-        print(scores)
         row_ind, col_ind = linear_sum_assignment(-scores)
         max_scores = np.zeros([max(len(gold_bags), len(pred_bags))])
         for row, column in zip(row_ind, col_ind):
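A note on the `-scores` above: `scipy.optimize.linear_sum_assignment` solves a minimum-cost bipartite matching, so negating the pairwise F1 matrix makes it return the gold/prediction pairing with the highest total F1. A minimal standalone illustration:

import numpy as np
from scipy.optimize import linear_sum_assignment

# Toy pairwise-F1 matrix: rows are gold bags, columns are predicted bags.
scores = np.array([[0.9, 0.1],
                   [0.2, 0.8]])

# Minimizing the negated scores maximizes the total matched F1.
row_ind, col_ind = linear_sum_assignment(-scores)
for g, p in zip(row_ind, col_ind):
    print(f"gold bag {g} <-> pred bag {p}: f1 = {scores[g, p]}")
# gold bag 0 <-> pred bag 0: f1 = 0.9
# gold bag 1 <-> pred bag 1: f1 = 0.8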
@@ -169,7 +165,10 @@ class DROP(Task):
         A dictionary where keys are the names of submetrics and values are
             functions that aggregate a list of metrics
         """
         return {"em": mean, "f1": mean}
 
     def higher_is_better(self):
         """
@@ -178,60 +177,7 @@ class DROP(Task):
         A dictionary where keys are the names of submetrics and values are
             whether a higher value of the submetric is better
         """
         return {"em": True, "f1": True}
 
-# Temporary sanity-checks
-def main():
-    drop = DROP()
-
-    def test_bags():
-        multiple_answers = ["Pacific Ocean", "Pacific"]
-        ma_bags = drop._answer_to_bags(multiple_answers)
-        print(f"Multiple Choice Answer Bags: {multiple_answers} => {ma_bags}")
-        assert len(ma_bags) == 2
-        number_answer = ["1974"]
-        number_bags = drop._answer_to_bags(number_answer)
-        print(f"Number Bags: {number_answer} => {number_bags}")
-        print()
-    test_bags()
-
-    def test_is_number_match():
-        gold = ["10 29 1999"]
-        pred = ["4 29 1990"]
-        gb = drop._answer_to_bags(gold)
-        pb = drop._answer_to_bags(pred)
-        print(gb)
-        print(pb)
-        for g in gb:
-            for p in pb:
-                match = drop._is_number_match(g, p)
-                print(match)
-        print()
-    #test_is_number_match()
-
-    def test_exact_match():
-        gold = ["Bob Ross"]
-        pred = ["Bob Ross"]
-        em = drop._exact_match(gold, pred)
-        print(em)
-    #test_exact_match()
-
-    def test_f1_score():
-        gold = ["25 to 44"]
-        pred = ["25 to 44 or 45 to 64"]
-        f1 = drop._f1_score(gold, pred)
-        print(gold)
-        print(pred)
-        print(f1)
-        gold = ["300", "1992"]
-        pred = ["300", "1992"]
-        f1 = drop._f1_score(gold, pred)
-        print(f1)
-    #test_f1_score()
-
-if __name__ == "__main__":
-    main()