Unverified commit 8ae88962 authored by Stella Biderman, committed by GitHub

Merge pull request #90 from EleutherAI/no_footguns

Get rid of some footguns
parents 27a859e2 f4120e59
......@@ -31,10 +31,14 @@ class GPT2LM(LM):
cont_toks = inp[:, ctxlen:] # [batch, seq]
logits = F.log_softmax(self.gpt2(inp)[0], dim=-1)[:, ctxlen - 1:-1] # [batch, seq, vocab]
greedy_tokens = logits.argmax(dim=-1)
max_equal = (greedy_tokens == cont_toks).all()
logits = torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze(-1) # [batch, seq]
# TODO: implement isgreedy
res.append((float(logits.sum()), False))
res.append((float(logits.sum()), bool(max_equal)))
return res
......
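For orientation, a toy, self-contained illustration of the check the gpt2.py hunk above introduces (the tensor values below are made up; the shapes follow the comments in the diff): instead of hard-coding False, each result is now a (loglikelihood, is_greedy) pair, where is_greedy is True only if greedy decoding would have reproduced the reference continuation exactly.

import torch

# Stand-ins for the model output and the reference continuation
# (shapes [batch, seq, vocab] and [batch, seq], as in the diff above).
logits = torch.log_softmax(torch.randn(1, 3, 10), dim=-1)
cont_toks = torch.tensor([[4, 7, 2]])

greedy_tokens = logits.argmax(dim=-1)           # tokens greedy decoding would pick
max_equal = (greedy_tokens == cont_toks).all()  # did greedy decoding match the continuation?

cont_logprobs = torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze(-1)
result = (float(cont_logprobs.sum()), bool(max_equal))  # (loglikelihood, is_greedy)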
# REMINDER: this code needs to be rewritten for the new framework. Remove this comment when the code is fully converted.
from . common import HFTask
class ANLIBase(HFTask):
......@@ -45,11 +43,50 @@ class ANLIBase(HFTask):
def doc_to_target(self, doc):
return " " + ["True", "Neither", "False"][doc['label']]
# TODO: Implement evaluation code
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param results:
The results of the requests created in construct_requests.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def aggregation(self):
"""
:returns: {str: [float] -> float}
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def higher_is_better(self):
"""
:returns: {str: bool}
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
class ANLIRound1(ANLIBase):
SPLIT = 1
......
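The same four stubbed methods (construct_requests, process_results, aggregation, higher_is_better) recur for nearly every task in this diff, each pointing at the BoolQ example in superglue.py. As a hedged sketch only — not the merged implementation, and with the exact request/result plumbing assumed — here is one plausible way a three-way classification task like ANLI could fill them in, reusing the rf and mean helpers that other files in this diff import from lm_eval.base; the candidate labels mirror doc_to_target above.

import numpy as np
from lm_eval.base import rf, mean

def construct_requests(self, doc, ctx):
    # One loglikelihood request per candidate continuation; label order matches doc_to_target.
    return [rf.loglikelihood(ctx, " " + label) for label in ["True", "Neither", "False"]]

def process_results(self, doc, results):
    # Assumes each result carries (loglikelihood, is_greedy), per the gpt2.py change above;
    # score the document by whichever candidate the model found most likely.
    lls = [ll for ll, _ in results]
    return {"acc": float(int(np.argmax(lls)) == doc["label"])}

def aggregation(self):
    # Per-document accuracies are averaged over the evaluation set.
    return {"acc": mean}

def higher_is_better(self):
    return {"acc": True}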
# REMINDER: this code needs to be rewritten for the new framework. Remove this comment when the code is fully converted.
from . common import HFTask
class ARCEasy(HFTask):
......@@ -25,9 +23,50 @@ class ARCEasy(HFTask):
def doc_to_target(self, doc):
return " " + doc['choices']['text'][doc['choices']['label'].index(doc['answerKey'])]
def evaluate(self, docs, lm, provide_description, num_fewshot):
# TODO: implement
raise NotImplementedError()
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param results:
The results of the requests created in construct_requests.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def aggregation(self):
"""
:returns: {str: [float] -> float}
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def higher_is_better(self):
"""
:returns: {str: bool}
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
class ARCChallenge(ARCEasy):
DATASET_PATH = "ai2_arc"
......
......@@ -33,22 +33,61 @@ class CoQA(Dataset):
return json.load(open('data/coqa/coqa-dev-v1.0.json'))['data']
def test_docs(self):
pass
pass
def fewshot_description(self):
pass
# TODO: figure out description
return ""
def doc_to_text(self, doc, include_target=True):
text = [doc['story']]
for pair in zip(doc['questions'], doc['answers']):
text.append('\n\n')
text.append(''.join(['Q: ',pair[0]['input_text'], '\n\n']))
if include_target:
text.append(''.join(['A: ',pair[1]['input_text']]))
else:
text.append('A: ')
return ''.join(text)
def evaluate(self, docs, lm):
pass
def doc_to_text(self, doc):
# TODO: implement.
raise NotImplementedError('doc_to_text not implemented')
def doc_to_target(self, doc):
# TODO: implement.
raise NotImplementedError('doc_to_target not implemented')
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param results:
The results of the requests created in construct_requests.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def aggregation(self):
"""
:returns: {str: [float] -> float}
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def higher_is_better(self):
"""
:returns: {str: bool}
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
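For reference, a toy illustration of the prompt format the (now removed) CoQA doc_to_text produced. The document below is made up purely for illustration; the field names follow the CoQA JSON layout used in the loop above.

doc = {
    "story": "Anna adopted a small grey cat named Momo.",
    "questions": [{"input_text": "What did Anna adopt?"},
                  {"input_text": "What is its name?"}],
    "answers": [{"input_text": "a small grey cat"},
                {"input_text": "Momo"}],
}

text = [doc["story"]]
for q, a in zip(doc["questions"], doc["answers"]):
    text.append("\n\n")
    text.append("Q: " + q["input_text"] + "\n\n")
    text.append("A: " + a["input_text"])
print("".join(text))
# Anna adopted a small grey cat named Momo.
#
# Q: What did Anna adopt?
#
# A: a small grey cat
# ... (one Q/A block per turn)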
# REMINDER: this code needs to be rewritten for the new framework. Remove this comment when the code is fully converted.
import numpy as np
import json
from scipy.stats import pearsonr, spearmanr
......@@ -60,10 +58,50 @@ class DROP(Dataset):
return ''.join([doctext, '\n'.join(qa_texts)])
def fewshot_description(self):
return "Read the passage and answer the questions "
# TODO: figure out description
return ""
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param results:
The results of the requests created in construct_requests.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
# TODO: Implement evaluation code
def aggregation(self):
"""
:returns: {str: [float] -> float}
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
def higher_is_better(self):
"""
:returns: {str: bool}
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
\ No newline at end of file
# REMINDER: this code needs to be rewritten for the new framework. Remove this comment when the code is fully converted.
import numpy as np
from lm_eval.base import rf, mean, f1_score, matthews_corrcoef
from scipy.stats import pearsonr, spearmanr
......@@ -453,37 +451,47 @@ class STSB(HFTask):
def doc_to_target(self, doc):
return " {}".format(doc["label"])
def evaluate(self, docs, lm, provide_description, num_fewshot):
# TODO: Implement evaluation code using new framework
# ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
golds = [doc["label"] for doc in docs]
preds = []
for doc in tqdm_lib.tqdm(docs):
ctx = self.fewshot_context(
doc=doc,
provide_description=provide_description,
num_fewshot=num_fewshot,
)
output = lm.generate(context=ctx, max_gen_length=5).strip()
first_element = output.split()[0]
if first_element.isnumeric():
pred = max(min(float(first_element), 5.0), 0.0)
else:
pred = 2.5
import pdb; pdb.set_trace()
preds.append(pred)
pearson_corr = float(pearsonr(preds, golds)[0])
spearman_corr = float(spearmanr(preds, golds)[0])
minor = {
"pearson": pearson_corr,
"spearmanr": spearman_corr,
"corr": (pearson_corr + spearman_corr) / 2,
}
return {
"major": minor["corr"],
"minor": minor,
"higher_is_better": True,
}
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param results:
The results of the requests created in construct_requests.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def aggregation(self):
"""
:returns: {str: [float] -> float}
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def higher_is_better(self):
"""
:returns: {str: bool}
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
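The removed STSB evaluate() above computed Pearson and Spearman correlations over all predictions at once. A hedged sketch (not the merged code) of how that logic could map onto the new per-document/aggregation split: process_results would emit a (prediction, gold) pair per document — the shape of that per-document value is an assumption — and aggregation() supplies functions that reduce the list of pairs to a correlation.

from scipy.stats import pearsonr, spearmanr

def _pearson(pairs):
    preds, golds = zip(*pairs)
    return float(pearsonr(preds, golds)[0])

def _spearman(pairs):
    preds, golds = zip(*pairs)
    return float(spearmanr(preds, golds)[0])

def aggregation(self):
    return {"pearson": _pearson, "spearmanr": _spearman}

def higher_is_better(self):
    return {"pearson": True, "spearmanr": True}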
# REMINDER: this code needs to be rewritten for the new framework. Remove this comment when the code is fully converted.
import numpy as np
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import f1_score, matthews_corrcoef
......@@ -51,8 +49,47 @@ class HellaSwag(HFTask):
raise ValueError("HellaSwag from HF datasets contained an invalid answer key")
return doc['endings'][index]
# TODO: Implement evaluation code
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param results:
The results of the requests created in construct_requests.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def aggregation(self):
"""
:returns: {str: [float] -> float}
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
\ No newline at end of file
def higher_is_better(self):
"""
:returns: {str: bool}
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
\ No newline at end of file
# REMINDER: this code needs to be rewritten for the new framework. Remove this comment when the code is fully converted.
from lm_eval.base import Dataset
from lm_eval.utils import sh
import json
......@@ -42,12 +40,53 @@ class Lambada(Dataset):
return self.load_doc(myjson)
def doc_to_text(self, doc, include_target=True):
#TODO: check if this is how OA does it
#label = doc[]
return doc
# TODO: implement.
def fewshot_description(self):
# TODO: figure out description
return ""
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param results:
The results of the requests created in construct_requests.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
# TODO: Implement evaluation code
def aggregation(self):
"""
:returns: {str: [float] -> float}
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
\ No newline at end of file
def higher_is_better(self):
"""
:returns: {str: bool}
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
\ No newline at end of file
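LAMBADA scores the final word of a passage given everything before it, which is why the Lambada doc_to_text above still carries a "check if this is how OA does it" TODO. A hypothetical helper (not part of this PR) showing one plausible context/target split on the last space:

def split_lambada_doc(text):
    # Everything before the last word is context; the last word (with a leading
    # space) is the completion to be scored.
    context, _, last_word = text.strip().rpartition(" ")
    return context, " " + last_word

ctx, target = split_lambada_doc("She opened the door and saw her old friend")
assert target == " friend"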
# REMINDER: this code needs to be rewritten for the new framework. Remove this comment when the code is fully converted.
from . common import HFTask
from itertools import islice
......@@ -46,8 +44,47 @@ class NaturalQs(HFTask):
long_answer = " ".join(long_answer_chars)
return long_answer # Replace with short_answer[0] for short answer
# TODO: Implement evaluation code
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param results:
The results of the requests created in construct_requests.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def aggregation(self):
"""
:returns: {str: [float] -> float}
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
\ No newline at end of file
def higher_is_better(self):
"""
:returns: {str: bool}
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
\ No newline at end of file
# REMINDER: this code needs to be rewritten for the new framework. Remove this comment when the code is fully converted.
import numpy as np
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import f1_score, matthews_corrcoef
......@@ -34,7 +32,8 @@ class OpenBookQA(HFTask):
return self.data["test"]
def fewshot_description(self):
return "Text of the question prompt\nText of the answer completion"
# TODO: figure out fewshot description
return ""
def doc_to_text(self, doc):
return doc['question_stem'] + '\n'
......@@ -53,8 +52,47 @@ class OpenBookQA(HFTask):
raise ValueError("OpenBookQA from HF datasets contained an invalid answer key")
return doc['choices']['text'][index] + '.'
# TODO: Implement evaluation code
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param results:
The results of the requests created in construct_requests.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def aggregation(self):
"""
:returns: {str: [float] -> float}
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
\ No newline at end of file
def higher_is_better(self):
"""
:returns: {str: bool}
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
\ No newline at end of file
# REMINDER: this code needs to be rewritten for the new framework. Remove this comment when the code is fully converted.
import json
import random
from lm_eval.base import Dataset
......@@ -45,7 +43,8 @@ class PiQA(Dataset):
return self.load_docs('data/piqa/piqa-test.jsonl', None)
def fewshot_description(self):
pass
# TODO: figure out fewshot description
return ""
def doc_to_text(self, doc):
#TODO: check if oa uses newline
......@@ -55,8 +54,47 @@ class PiQA(Dataset):
rightanswer = int(doc[1][0]) + 1
return ''.join([doc[0]['goal'],' ',doc[0]['sol'+str(rightanswer)]])
# TODO: Implement evaluation code
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param results:
The results of the requests created in construct_requests.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def aggregation(self):
"""
:returns: {str: [float] -> float}
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
\ No newline at end of file
def higher_is_better(self):
"""
:returns: {str: bool}
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
\ No newline at end of file
# REMINDER: this code needs to be rewritten for the new framework. Remove this comment when the code is fully converted.
import json
import random
import os
......@@ -40,6 +38,7 @@ class QuAC(Dataset):
raise NotImplementedError("QuAC has no test docs.")
def fewshot_description(self):
# TODO: figure out fewshot description
desc = "TITLE: Title of the context passage - subtitle of the passage\nPARAGRAPH: Passage describing the relevant information for answering questions.\n\nQ: Text of a question.\n\nA: Answer to the question, based on the passage. If it cannot be answered based on the passage, write CANNOTANSWER"
return desc
......@@ -61,8 +60,47 @@ class QuAC(Dataset):
def doc_to_target(self, doc):
return doc['answer']
# TODO: Implement evaluation code
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param results:
The results of the requests created in construct_requests.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def aggregation(self):
"""
:returns: {str: [float] -> float}
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
\ No newline at end of file
def higher_is_better(self):
"""
:returns: {str: bool}
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
\ No newline at end of file
# REMINDER: this code needs to be rewritten for the new framework. Remove this comment when the code is fully converted.
from . common import HFTask
from ..utils_stream import X, each, apply, join, filt, one
import collections
......@@ -67,8 +65,47 @@ class RACE(HFTask):
return r
# TODO: Implement evaluation code
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
\ No newline at end of file
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param results:
The results of the requests created in construct_requests.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def aggregation(self):
"""
:returns: {str: [float] -> float}
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def higher_is_better(self):
"""
:returns: {str: bool}
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
\ No newline at end of file
# REMINDER: this code needs to be rewritten for the new framework. Remove this comment when the code is fully converted.
import json
import random
import os
......
# REMINDER: this code needs to be rewritten for the new framework. Remove this comment when the code is fully converted.
import numpy as np
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import f1_score, matthews_corrcoef
......@@ -42,8 +40,47 @@ class SQuAD(HFTask):
answer = 'unanswerable'
return answer
# TODO: Implement evaluation code
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param results:
The results of the requests created in construct_requests.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def aggregation(self):
"""
:returns: {str: [float] -> float}
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
\ No newline at end of file
def higher_is_better(self):
"""
:returns: {str: bool}
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
\ No newline at end of file
# REMINDER: this code needs to be rewritten for the new framework. Remove this comment when the code is fully converted.
import json
import random
from lm_eval.base import Dataset
......@@ -39,7 +37,8 @@ class StoryCloze(Dataset):
def fewshot_description(self):
pass
# TODO: figure out fewshot description
return ""
def doc_to_text(self, doc):
return ' '.join([*doc[1:5]])
......@@ -47,9 +46,47 @@ class StoryCloze(Dataset):
def doc_to_target(self, doc):
return " " + doc[int(doc[-1]) - 4]
# TODO: Implement evaluation code
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param results:
The results of the requests created in construct_requests.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
def aggregation(self):
"""
:returns: {str: [float] -> float}
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def higher_is_better(self):
"""
:returns: {str: bool}
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
\ No newline at end of file
......@@ -356,9 +356,47 @@ class RTE(HFTask):
def doc_to_target(self, doc):
return 'True' if doc['label'] == 0 else 'False'
# TODO: Implement evaluation code
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param results:
The results of the requests created in construct_requests.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
def aggregation(self):
"""
:returns: {str: [float] -> float}
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def higher_is_better(self):
"""
:returns: {str: bool}
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
\ No newline at end of file
# REMINDER: this code needs to be rewritten for the new framework. Remove this comment when the code is fully converted.
import json
import random
from lm_eval.base import Dataset
......@@ -37,7 +35,8 @@ class TriviaQA(Dataset):
return json.load(open('data/triviaqa/triviaqa-unfiltered/unfiltered-web-test.json'))['Data']
def fewshot_description(self):
pass
# TODO: figure out fewshot description
return ""
def doc_to_text(self, doc):
return ''.join(['Q: ', doc['Question'], '\n\n','A: '])
......@@ -45,8 +44,47 @@ class TriviaQA(Dataset):
def doc_to_target(self, doc):
return doc['Answer']['Aliases'][0]
# TODO: Implement evaluation code
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param results:
The results of the requests created in construct_requests.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def aggregation(self):
"""
:returns: {str: [float] -> float}
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
\ No newline at end of file
def higher_is_better(self):
"""
:returns: {str: bool}
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
\ No newline at end of file
# REMINDER: this code needs to be rewritten for the new framework. Remove this comment when the code is fully converted.
from . common import HFTask
class WebQs(HFTask):
......@@ -29,8 +27,47 @@ class WebQs(HFTask):
# TODO: make sure we're actually handling multi-answer correctly
return " " + doc['answers'][0]
# TODO: Implement evaluation code
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param results:
The results of the requests created in construct_requests.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def aggregation(self):
"""
:returns: {str: [float] -> float}
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
\ No newline at end of file
def higher_is_better(self):
"""
:returns: {str: bool}
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
\ No newline at end of file
# REMINDER: this code needs to be rewritten for the new framework. Remove this comment when the code is fully converted.
import numpy as np
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import f1_score, matthews_corrcoef
......@@ -11,16 +9,59 @@ class WikiText103(NLP_TASK):
NLP_NAME = "wikitext-103-raw-v1"
def fewshot_description(self):
# TODO: figure out fewshot description
return ""
def doc_to_text(self, doc, include_target=True):
return doc['text']
def doc_to_text(self, doc):
# TODO: implement
raise NotImplementedError('doc_to_text not implemented')
def doc_to_target(self, doc):
# TODO: implement
raise NotImplementedError('doc_to_target not implemented')
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param results:
The results of the requests created in construct_requests.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
# TODO: Implement evaluation code
def aggregation(self):
"""
:returns: {str: [float] -> float}
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
def higher_is_better(self):
"""
:returns: {str: bool}
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
class WikiText2(NLP_TASK):
......@@ -28,13 +69,56 @@ class WikiText2(NLP_TASK):
NLP_NAME = "wikitext-2-raw-v1"
def fewshot_description(self):
# TODO: figure out fewshot description
return ""
def doc_to_text(self, doc, include_target=True):
return doc['text']
def doc_to_text(self, doc):
# TODO: implement
raise NotImplementedError('doc_to_text not implemented')
def doc_to_target(self, doc):
# TODO: implement
raise NotImplementedError('doc_to_target not implemented')
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param results:
The results of the requests created in construct_requests.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
# TODO: Implement evaluation code
def aggregation(self):
"""
:returns: {str: [float] -> float}
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
# ***IMPORTANT***: this evaluation function needs to be written for the new framework.
# For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
# Remove this comment when the evaluation code is implemented.
\ No newline at end of file
def higher_is_better(self):
"""
:returns: {str: bool}
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
\ No newline at end of file
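The WikiText tasks are language-modeling benchmarks, so their eventual metric is perplexity rather than accuracy. A hedged sketch (not the merged code) of a word-level perplexity aggregator: process_results would emit a (total_loglikelihood, num_words) pair per document — that per-document shape is an assumption — and the function below reduces the list of pairs.

import math

def weighted_perplexity(pairs):
    """pairs: iterable of (total_loglikelihood, num_words), one per document."""
    total_ll = sum(ll for ll, _ in pairs)
    total_words = sum(n for _, n in pairs)
    return math.exp(-total_ll / total_words)

# Example: two short documents.
print(weighted_perplexity([(-42.0, 10), (-21.0, 5)]))  # exp(63/15) ~ 66.7

Since lower perplexity is better, higher_is_better() would presumably map this metric to False.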