gaoqiong / lm-evaluation-harness

Commit 6738b241, authored Jan 31, 2021 by thefazzer

Merge branch 'master' into fazz/refactor-task-coqa

Parents: 47384df7, 6598967b

Showing 3 changed files with 27 additions and 19 deletions:
lm_eval/base.py            +3  -6
lm_eval/tasks/__init__.py  +3  -3
lm_eval/tasks/anli.py      +21 -10

lm_eval/base.py

@@ -82,22 +82,19 @@ class Dataset(abc.ABC):
         """Whether the task has a test set"""
         pass
 
-    @abc.abstractmethod
     def training_docs(self):
         """
         :return: Iterable[obj]
             A iterable of any object, that doc_to_text can handle
         """
-        pass
+        return []
 
-    @abc.abstractmethod
     def validation_docs(self):
-        pass
+        return []
 
-    @abc.abstractmethod
     def test_docs(self):
-        pass
+        return []
 
     def fewshot_examples(self, k):
         if self._traindocs is None:
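The upshot of this hunk is that training_docs, validation_docs, and test_docs are no longer hard abstract requirements and default to an empty list, so calling code can iterate over any split without guarding against None. A minimal sketch of that calling pattern, assuming only a task object derived from this base class (the all_docs helper is illustrative, not part of the harness):

    def all_docs(task):
        # Hypothetical helper: gather every document a task exposes.
        # With the new defaults, a split the task does not implement
        # contributes an empty list instead of None, so no guards are needed.
        docs = []
        for split in (task.training_docs, task.validation_docs, task.test_docs):
            docs.extend(split())
        return docs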
lm_eval/tasks/__init__.py

@@ -59,9 +59,9 @@ TASK_REGISTRY = {
     # "webqs": webqs.WebQs, # not implemented yet
     # "wsc273": wsc273.WinogradSchemaChallenge273, # not implemented yet
     # "winogrande": winogrande.Winogrande, # not implemented yet
-    # "anli_r1": anli.ANLIRound1, # not implemented yet
-    # "anli_r2": anli.ANLIRound2, # not implemented yet
-    # "anli_r3": anli.ANLIRound3, # not implemented yet
+    "anli_r1": anli.ANLIRound1,
+    "anli_r2": anli.ANLIRound2,
+    "anli_r3": anli.ANLIRound3,
     # arithmetic
     "arithmetic_2da": arithmetic.Arithmetic2DPlus,
     "arithmetic_2ds": arithmetic.Arithmetic2DMinus,
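Since TASK_REGISTRY is a plain dict mapping task names to task classes, un-commenting these entries is all it takes to make the ANLI rounds resolvable by name. A hedged sketch of how a caller might look them up (the load_tasks helper is illustrative, not the harness's own API):

    from lm_eval import tasks

    def load_tasks(names):
        # Illustrative: each registry value is a task class; instantiating it
        # yields a ready-to-use task object (which may download its dataset).
        return {name: tasks.TASK_REGISTRY[name]() for name in names}

    anli_tasks = load_tasks(["anli_r1", "anli_r2", "anli_r3"])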
lm_eval/tasks/anli.py

+import numpy as np
+from lm_eval.base import rf, mean
 from . common import HFTask
 
 
 class ANLIBase(HFTask):
 
@@ -33,7 +35,6 @@ class ANLIBase(HFTask):
         return ""
 
     def doc_to_text(self, doc):
-        print(doc)
         # OA does this a bit weirdly: they prepend "anli 1: anli 1: " to the beginning
         # of the prompt (yes, repeating it!). also, " True, False, or Neither?" is directly
         # appended onto the question, with no "Answer:" or even a newline. Do we *really*
@@ -41,6 +42,9 @@ class ANLIBase(HFTask):
         return doc['premise'] + '\nQuestion: ' + doc['hypothesis'] + '\nTrue, False, or Neither?'
 
     def doc_to_target(self, doc):
+        # True = entailment
+        # False = contradiction
+        # Neither = neutral
         return " " + ["True", "Neither", "False"][doc['label']]
 
     def construct_requests(self, doc, ctx):
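To make the prompt format concrete, here is an invented ANLI-style document run through the two methods above (the premise and hypothesis text are made up; the field names and label convention follow the diff):

    doc = {
        "premise": "A soccer game with multiple males playing.",   # illustrative
        "hypothesis": "Some men are playing a sport.",              # illustrative
        "label": 0,  # 0 = entailment, 1 = neutral, 2 = contradiction
    }

    # doc_to_text(doc) would render:
    #   A soccer game with multiple males playing.
    #   Question: Some men are playing a sport.
    #   True, False, or Neither?
    #
    # doc_to_target(doc) indexes ["True", "Neither", "False"] with the label
    # and prepends a space, so for this document it would return " True".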
@@ -54,8 +58,10 @@ class ANLIBase(HFTask):
         language description, as well as the few shot examples, and the question
         part of the document for `doc`.
         """
-        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        ll_true, _ = rf.loglikelihood(ctx, " True")
+        ll_neither, _ = rf.loglikelihood(ctx, " Neither")
+        ll_false, _ = rf.loglikelihood(ctx, " False")
+        return ll_true, ll_neither, ll_false
 
     def process_results(self, doc, results):
         """Take a single document and the LM results and evaluates, returning a
@@ -67,8 +73,11 @@ class ANLIBase(HFTask):
         :param results:
             The results of the requests created in construct_requests.
         """
-        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        gold = doc["label"]
+        pred = np.argmax(results)
+        return {
+            "acc": pred == gold
+        }
 
     def aggregation(self):
         """
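Together, construct_requests and process_results implement standard multiple-choice scoring: the context is scored against each candidate continuation and the highest loglikelihood wins. A small sketch of that final step in isolation (the loglikelihood values are invented):

    import numpy as np

    # Loglikelihoods an LM might assign to " True", " Neither", " False"
    # for a single ANLI prompt (invented numbers, in the order used by
    # the tuple returned from construct_requests).
    results = (-1.2, -2.7, -3.1)
    gold = 0  # entailment, i.e. " True"

    pred = np.argmax(results)    # index of the best-scoring continuation
    acc = bool(pred == gold)     # what process_results reports as "acc"
    print(acc)                   # True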
@@ -76,8 +85,9 @@ class ANLIBase(HFTask):
         A dictionary where keys are the names of submetrics and values are
             functions that aggregate a list of metrics
         """
-        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        return {
+            "acc": mean
+        }
 
     def higher_is_better(self):
         """
@@ -85,8 +95,9 @@ class ANLIBase(HFTask):
         A dictionary where keys are the names of submetrics and values are
             whether a higher value of the submetric is better
         """
-        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        return {
+            "acc": True
+        }
 
 class ANLIRound1(ANLIBase):
     SPLIT = 1
@@ -95,4 +106,4 @@ class ANLIRound2(ANLIBase):
     SPLIT = 2
 
 class ANLIRound3(ANLIBase):
-    SPLIT = 3
\ No newline at end of file
+    SPLIT = 3