Add `HellaSwag` evaluation implementation

65c46d22 · Jon Tow · 7031c324 · 65c46d22 · 65c46d22
Commit 65c46d22 authored Jan 28, 2021 by Jon Tow
Hide whitespace changes
Inline Side-by-side

Showing with 33 additions and 23 deletions

lm_eval/tasks/__init__.py lm_eval/tasks/__init__.py +1 -1

lm_eval/tasks/hellaswag.py lm_eval/tasks/hellaswag.py +32 -22

No files found.
--- a/lm_eval/tasks/__init__.py
+++ b/lm_eval/tasks/__init__.py
@@ -40,7 +40,7 @@ TASK_REGISTRY = {
    # "arc_easy": arc.ARCEasy, # not implemented yet
    # "arc_challenge": arc.ARCChallenge, # not implemented yet
    # "quac": quac.QuAC, # not implemented yet
-    # "hellaswag": hellaswag.HellaSwag, # not implemented yet
+    "hellaswag": hellaswag.HellaSwag,
    # "openbookqa": openbookqa.OpenBookQA, # not implemented yet
    # "sat": sat.SATAnalogies, # not implemented yet
    # "squad": squad.SQuAD, # not implemented yet

--- a/lm_eval/tasks/hellaswag.py
+++ b/lm_eval/tasks/hellaswag.py
 import numpy as np
-from scipy.stats import pearsonr, spearmanr
-from sklearn.metrics import f1_score, matthews_corrcoef
-from tqdm import auto as tqdm_lib
-from . common import HFTask, simple_accuracy_metric, yesno
+from ..base import rf, mean
+from . common import HFTask
+

 class HellaSwag(HFTask):
    DATASET_PATH = "hellaswag"
@@ -30,7 +29,9 @@ class HellaSwag(HFTask):
            return self.data["test"]

    def fewshot_description(self):
-        return "Label for the relevant action: Sentences describing the context, with an incomplete sentence trailing\nanswer that plausibly completes the situation."
+        return "Label for the relevant action: Sentences describing the " \
+            "context, with an incomplete sentence trailing\nanswer that " \
+            "plausibly completes the situation."

    def doc_to_text(self, doc):
        return doc['activity_label'] + ': ' + doc['ctx'] + '\n'
@@ -46,26 +47,29 @@ class HellaSwag(HFTask):
        elif letter_answer == '3':
            index = 3
        else:
-            raise ValueError("HellaSwag from HF datasets contained an invalid answer key")
+            raise ValueError(
+                "HellaSwag from HF datasets contained an invalid answer key")
        return doc['endings'][index]

    def construct_requests(self, doc, ctx):
-        """ Uses RequestFactory to construct Requests and returns an iterable of 
+        """ Uses RequestFactory to construct Requests and returns an iterable of
        Requests which will be sent to the LM.

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param ctx: str
-            The context string, generated by fewshot_context. This includes the natural 
+            The context string, generated by fewshot_context. This includes the natural
            language description, as well as the few shot examples, and the question
-            part of the document for `doc`. 
+            part of the document for `doc`.
        """
-        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
-    
+        ll_answers = [
+            rf.loglikelihood(ctx, doc['endings'][i])[0] for i in range(4)
+        ]
+        return ll_answers
+
    def process_results(self, doc, results):
-        """Take a single document and the LM results and evaluates, returning a 
-        dict where keys are the names of submetrics and values are the values of 
+        """Take a single document and the LM results and evaluates, returning a
+        dict where keys are the names of submetrics and values are the values of
        the metric for that one document

        :param doc:
@@ -73,23 +77,29 @@ class HellaSwag(HFTask):
        :param results:
            The results of the requests created in construct_requests.
        """
-        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        gold = int(doc['label'])
+        pred = np.argmax(results)
+        acc = 1. if pred == gold else 0.
+        return {
+            "acc": acc
+        }

    def aggregation(self):
        """
        :returns: {str: [float] -> float}
-            A dictionary where keys are the names of submetrics and values are 
+            A dictionary where keys are the names of submetrics and values are
            functions that aggregate a list of metrics
        """
-        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        return {
+            "acc": mean
+        }

    def higher_is_better(self):
        """
        :returns: {str: bool}
-            A dictionary where keys are the names of submetrics and values are 
+            A dictionary where keys are the names of submetrics and values are
            whether a higher value of the submetric is better
        """
-        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
\ No newline at end of file
+        return {
+            "acc": True
+        }