Commit f48b119d authored by Charles Foster

Skeleton of SQuADv2. Not yet tested.

parent 3d432b1a

import datasets

from lm_eval.base import rf, mean
from . common import HFTask


class SQuAD(HFTask):
    DATASET_PATH = "squad_v2"
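    # "squad_v2" is the Hugging Face datasets identifier for SQuAD 2.0, which
    # extends SQuAD 1.1 with questions that cannot be answered from the passage.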
@@ -18,16 +18,14 @@ class SQuAD(HFTask):
        return False

    def training_docs(self):
        if self.has_training_docs():
            return self.data["train"]

    def validation_docs(self):
        if self.has_validation_docs():
            return self.data["validation"]

    def fewshot_description(self):
        # TODO: figure out description
        return ""

    def doc_to_text(self, doc):
        return 'Title: ' + doc['title'] + '\n\n' + 'Background: ' + doc['context'] + '\n\n' + 'Q: ' + doc['question'] + '\n\n' + 'A: '
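
    # For illustration (the values below are made up, not taken from the
    # dataset), doc_to_text renders a record as:
    #
    #   Title: Some_Title
    #
    #   Background: A passage of background text.
    #
    #   Q: A question about the passage.
    #
    #   A:
    #
    # and the model's continuation after "A: " is treated as its answer.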
@@ -51,8 +49,8 @@ class SQuAD(HFTask):
        language description, as well as the few shot examples, and the question
        part of the document for `doc`.
        """
        continuation = rf.greedy_until(ctx, ['\n'])
        return continuation
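
    # rf.greedy_until builds a single generation request: the LM continues the
    # prompt greedily and stops at the first newline, matching the one-line
    # "A: <answer>" format set up by doc_to_text. The resulting continuation is
    # what process_results receives below.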

    def process_results(self, doc, results):
        """Take a single document and the LM results and evaluate, returning a
@@ -64,8 +62,21 @@ class SQuAD(HFTask):
        :param results:
            The results of the requests created in construct_requests.
        """
        squad_metric = datasets.load_metric("squad_v2")
        predictions = [{
            'id': doc['id'],
            'prediction_text': results[0],
            # The squad_v2 metric also expects a per-prediction no-answer
            # probability; 0.0 (i.e. always attempt an answer) is a placeholder
            # until proper no-answer scoring is added.
            'no_answer_probability': 0.0,
        }]
        references = [{
            'id': doc['id'],
            'answers': doc['answers'],
        }]
        metrics = squad_metric.compute(predictions=predictions, references=references)
        return metrics
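
    # Note: the squad_v2 metric is computed per document here and the submetrics
    # are then averaged in aggregation() below. The threshold-based submetrics
    # (best_exact, best_f1 and their thresholds) are designed to be swept over a
    # whole evaluation set, so their per-document means are only rough summaries.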

    def aggregation(self):
        """
@@ -73,8 +84,21 @@ class SQuAD(HFTask):
        A dictionary where keys are the names of submetrics and values are
        functions that aggregate a list of metrics
        """
        return {
            'exact': mean,  # Exact match (the normalized answer exactly matches the gold answer)
            'f1': mean,  # The F-score of predicted tokens versus the gold answer
            'total': mean,  # Number of scores considered
            'HasAns_exact': mean,  # Exact match (the normalized answer exactly matches the gold answer)
            'HasAns_f1': mean,  # The F-score of predicted tokens versus the gold answer
            'HasAns_total': mean,  # Number of scores considered
            'NoAns_exact': mean,  # Exact match (the normalized answer exactly matches the gold answer)
            'NoAns_f1': mean,  # The F-score of predicted tokens versus the gold answer
            'NoAns_total': mean,  # Number of scores considered
            'best_exact': mean,  # Best exact match (with varying threshold)
            'best_exact_thresh': mean,  # No-answer probability threshold associated with the best exact match
            'best_f1': mean,  # Best F1 (with varying threshold)
            'best_f1_thresh': mean,  # No-answer probability threshold associated with the best F1
        }

    def higher_is_better(self):
        """
@@ -82,5 +106,18 @@ class SQuAD(HFTask):
        A dictionary where keys are the names of submetrics and values are
        whether a higher value of the submetric is better
        """
        return {
            'exact': True,  # Exact match (the normalized answer exactly matches the gold answer)
            'f1': True,  # The F-score of predicted tokens versus the gold answer
            'total': None,  # Number of scores considered
            'HasAns_exact': True,  # Exact match (the normalized answer exactly matches the gold answer)
            'HasAns_f1': True,  # The F-score of predicted tokens versus the gold answer
            'HasAns_total': None,  # Number of scores considered
            'NoAns_exact': True,  # Exact match (the normalized answer exactly matches the gold answer)
            'NoAns_f1': True,  # The F-score of predicted tokens versus the gold answer
            'NoAns_total': None,  # Number of scores considered
            'best_exact': True,  # Best exact match (with varying threshold)
            'best_exact_thresh': None,  # No-answer probability threshold associated with the best exact match
            'best_f1': True,  # Best F1 (with varying threshold)
            'best_f1_thresh': None,  # No-answer probability threshold associated with the best F1
        }
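

# A minimal, illustrative check (not part of the harness) of the input format
# that datasets.load_metric("squad_v2") expects and of the submetric keys it
# returns, which process_results and aggregation above rely on. The record is
# made up; the id and answer are hypothetical.
if __name__ == "__main__":
    squad_metric = datasets.load_metric("squad_v2")
    predictions = [
        {'id': '0', 'prediction_text': 'forty-two', 'no_answer_probability': 0.0},
    ]
    references = [
        {'id': '0', 'answers': {'text': ['forty-two'], 'answer_start': [22]}},
    ]
    # Returns 'exact', 'f1', 'total' and the best_* threshold metrics;
    # 'HasAns_*' / 'NoAns_*' appear when the references contain answerable /
    # unanswerable questions respectively.
    print(squad_metric.compute(predictions=predictions, references=references))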