Merge pull request #90 from EleutherAI/no_footguns

Get rid of some footguns

Merge pull request #90 from EleutherAI/no_footguns
Get rid of some footguns
8ae88962 · Stella Biderman · GitHub · 27a859e2 · f4120e59 · 8ae88962
Unverified Commit 8ae88962 authored Jan 21, 2021 by Stella Biderman Committed by GitHub Jan 21, 2021
Hide whitespace changes
Inline Side-by-side

Showing with 89 additions and 14 deletions

lm_eval/tasks/winogrande.py lm_eval/tasks/winogrande.py +44 -6

lm_eval/tasks/wsc273.py lm_eval/tasks/wsc273.py +45 -8

No files found.
--- a/lm_eval/tasks/winogrande.py
+++ b/lm_eval/tasks/winogrande.py
-# REMINDER: this code needs to be rewritten for the new framework. Remove this comment when the code is fully converted.
-
 import numpy as np
 from scipy.stats import pearsonr, spearmanr
 from sklearn.metrics import f1_score, matthews_corrcoef
@@ -32,6 +30,7 @@ class Winogrande(HFTask):
            return self.data["test"]

    def fewshot_description(self):
+        # TODO: redo description
        return "Winograd schema sentence including a either a ___ blank with a missing word, making the pronoun ambiguous, or the same with the word filled in."

    def doc_to_text(self, doc):
@@ -48,8 +47,47 @@ class Winogrande(HFTask):
            raise ValueError("Winogrande from HF datasets contained an invalid answer key")
        return text.replace("_", answer)

-    # TODO: Implement evaluation code
+    def construct_requests(self, doc, ctx):
+        """ Uses RequestFactory to construct Requests and returns an iterable of 
+        Requests which will be sent to the LM.
+
+        :param doc:
+            The document as returned from training_docs, validation_docs, or test_docs.
+        :param ctx: str
+            The context string, generated by fewshot_context. This includes the
+            natural-language description, the few-shot examples, and the question
+            part of the document for `doc`.
+        """
+        # TODO: implement evaluation.
+        raise NotImplementedError('Evaluation not implemented')
+    
+    def process_results(self, doc, results):
+        """Take a single document and the LM results and evaluate them, returning a
+        dict where keys are the names of submetrics and values are the values of
+        the metric for that one document.
+
+        :param doc:
+            The document as returned from training_docs, validation_docs, or test_docs.
+        :param results:
+            The results of the requests created in construct_requests.
+        """
+        # TODO: implement evaluation.
+        raise NotImplementedError('Evaluation not implemented')
+
+    def aggregation(self):
+        """
+        :returns: {str: [float] -> float}
+            A dictionary where keys are the names of submetrics and values are 
+            functions that aggregate a list of metrics
+        """
+        # TODO: implement evaluation.
+        raise NotImplementedError('Evaluation not implemented')

-    # ***IMPORTANT***: this evaluation function needs to be written for the new framework. 
-    # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py. 
-    # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
+    def higher_is_better(self):
+        """
+        :returns: {str: bool}
+            A dictionary where keys are the names of submetrics and values are 
+            whether a higher value of the submetric is better
+        """
+        # TODO: implement evaluation.
+        raise NotImplementedError('Evaluation not implemented')
\ No newline at end of file
--- a/lm_eval/tasks/wsc273.py
+++ b/lm_eval/tasks/wsc273.py
-# REMINDER: this code needs to be rewritten for the new framework. Remove this comment when the code is fully converted.
-
 import json
 import random
 import os
@@ -38,8 +36,7 @@ class WinogradSchemaChallenge273(Dataset):
        return self.load_doc(myjson)
    
    def fewshot_description(self):
-        # This format is ONLY for the purposes of deduplication. For the task evaluation, we'll need to find a new strategy,
-        # to meet the needs of this particular task.
+        # TODO: redo description
        return "Winograd schema sentence with correct continuation. True. Winograd schema sentence with incorrect continuation. False."

    def load_doc(self, myjson):
@@ -80,8 +77,48 @@ class WinogradSchemaChallenge273(Dataset):
        text = doc['completions']['T'] + ' True. ' + doc['completions']['F'] + ' False.'
        return text

-    # TODO: Implement evaluation code

-    # ***IMPORTANT***: this evaluation function needs to be written for the new framework. 
-    # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py. 
-    # Remove this comment when the evaluation code is implemented.
+    def construct_requests(self, doc, ctx):
+        """ Uses RequestFactory to construct Requests and returns an iterable of 
+        Requests which will be sent to the LM.
+
+        :param doc:
+            The document as returned from training_docs, validation_docs, or test_docs.
+        :param ctx: str
+            The context string, generated by fewshot_context. This includes the
+            natural-language description, the few-shot examples, and the question
+            part of the document for `doc`.
+        """
+        # TODO: implement evaluation.
+        raise NotImplementedError('Evaluation not implemented')
+    
+    def process_results(self, doc, results):
+        """Take a single document and the LM results and evaluate them, returning a
+        dict where keys are the names of submetrics and values are the values of
+        the metric for that one document.
+
+        :param doc:
+            The document as returned from training_docs, validation_docs, or test_docs.
+        :param results:
+            The results of the requests created in construct_requests.
+        """
+        # TODO: implement evaluation.
+        raise NotImplementedError('Evaluation not implemented')
+
+    def aggregation(self):
+        """
+        :returns: {str: [float] -> float}
+            A dictionary where keys are the names of submetrics and values are 
+            functions that aggregate a list of metrics
+        """
+        # TODO: implement evaluation.
+        raise NotImplementedError('Evaluation not implemented')
+
+    def higher_is_better(self):
+        """
+        :returns: {str: bool}
+            A dictionary where keys are the names of submetrics and values are 
+            whether a higher value of the submetric is better
+        """
+        # TODO: implement evaluation.
+        raise NotImplementedError('Evaluation not implemented')