SST with PromptSource (PS) integration. (It was already done.)

e49cf8da · cjlovering · 31a019c2 · e49cf8da
Commit e49cf8da authored Apr 25, 2022 by cjlovering
Show whitespace changes
Inline Side-by-side

Showing with 30 additions and 78 deletions

lm_eval/tasks/glue.py +30 -78

No files found.
--- a/lm_eval/tasks/glue.py
+++ b/lm_eval/tasks/glue.py
@@ -79,19 +79,13 @@ class CoLA(PromptSourceTask):
        print(f"PRED: {pred}")
        print("*" * 80)
-        return {
+        return {"mcc": (target, pred)}
-            "mcc": (target, pred)
-        }
    def higher_is_better(self):
-        return {
+        return {"mcc": True}
-            "mcc": True
-        }
    def aggregation(self):
-        return {
+        return {"mcc": matthews_corrcoef}
-            "mcc": matthews_corrcoef
-        }
 class SST(PromptSourceTask):
@@ -116,16 +110,6 @@ class SST(PromptSourceTask):
    def validation_docs(self):
        return self.dataset["validation"]
-    def higher_is_better(self):
-        return {
-            "acc": True
-        }
-    def aggregation(self):
-        return {
-            "acc": mean
-        }
 # Inference Tasks
@@ -160,19 +144,13 @@ class MNLI(PromptSourceTask):
    def process_results(self, doc, results):
        gold = doc["label"]
        pred = np.argmax(results)
-        return {
+        return {"acc": pred == gold}
-            "acc": pred == gold
-        }
    def higher_is_better(self):
-        return {
+        return {"acc": True}
-            "acc": True
-        }
    def aggregation(self):
-        return {
+        return {"acc": mean}
-            "acc": mean
-        }
 class MNLIMismatched(MNLI):
@@ -213,19 +191,13 @@ class QNLI(Task):
        ll_yes, ll_no = results
        pred = ll_no > ll_yes
        gold = doc["label"]
-        return {
+        return {"acc": pred == gold}
-            "acc": pred == gold
-        }
    def higher_is_better(self):
-        return {
+        return {"acc": True}
-            "acc": True
-        }
    def aggregation(self):
-        return {
+        return {"acc": mean}
-            "acc": mean
-        }
 class WNLI(PromptSourceTask):
@@ -252,14 +224,10 @@ class WNLI(PromptSourceTask):
        return self.dataset["validation"]
    def higher_is_better(self):
-        return {
+        return {"acc": True}
-            "acc": True
-        }
    def aggregation(self):
-        return {
+        return {"acc": mean}
-            "acc": mean
-        }
 class RTE(PromptSourceTask):
@@ -285,14 +253,10 @@ class RTE(PromptSourceTask):
        return self.dataset["validation"]
    def higher_is_better(self):
-        return {
+        return {"acc": True}
-            "acc": True
-        }
    def aggregation(self):
-        return {
+        return {"acc": mean}
-            "acc": mean
-        }
 # Similarity and Paraphrase Tasks
@@ -330,16 +294,10 @@ class MRPC(Task):
        }
    def higher_is_better(self):
-        return {
+        return {"acc": True, "f1": True}
-            "acc": True,
-            "f1": True
-        }
    def aggregation(self):
-        return {
+        return {"acc": mean, "f1": f1_score}
-            "acc": mean,
-            "f1": f1_score
-        }
 class QQP(Task):
@@ -388,16 +346,10 @@ class QQP(Task):
        }
    def higher_is_better(self):
-        return {
+        return {"acc": True, "f1": True}
-            "acc": True,
-            "f1": True
-        }
    def aggregation(self):
-        return {
+        return {"acc": mean, "f1": f1_score}
-            "acc": mean,
-            "f1": f1_score
-        }
 class STSB(Task):
@@ -435,7 +387,7 @@ class STSB(Task):
        return " {}".format(doc["label"])
    def construct_requests(self, doc, ctx):
-        """ Uses RequestFactory to construct Requests and returns an iterable of 
+        """Uses RequestFactory to construct Requests and returns an iterable of
        Requests which will be sent to the LM.
        :param doc:
@@ -446,7 +398,7 @@ class STSB(Task):
            part of the document for `doc`.
        """
        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        raise NotImplementedError("Evaluation not implemented")
    def process_results(self, doc, results):
        """Take a single document and the LM results and evaluates, returning a
@@ -459,7 +411,7 @@ class STSB(Task):
            The results of the requests created in construct_requests.
        """
        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        raise NotImplementedError("Evaluation not implemented")
    def aggregation(self):
        """
@@ -468,7 +420,7 @@ class STSB(Task):
            functions that aggregate a list of metrics
        """
        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        raise NotImplementedError("Evaluation not implemented")
    def higher_is_better(self):
        """
@@ -477,4 +429,4 @@ class STSB(Task):
            whether a higher value of the submetric is better
        """
        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        raise NotImplementedError("Evaluation not implemented")