Add comments as a reminder that evaluation needs to be written for the new framework

2d2dbf96 · Leo Gao · 6803e647 · 2d2dbf96 · 2d2dbf96 · 2d2dbf96
Commit 2d2dbf96 authored Jan 05, 2021 by Leo Gao
19 changed files
--- a/lm_eval/tasks/anli.py
+++ b/lm_eval/tasks/anli.py
@@ -45,9 +45,11 @@ class ANLIBase(HFTask):
        a = "True, False, or Neither?" + ((" " + ["True", "Neither", "False"][doc['label']]) if include_target else '')
        return q + a
-    def evaluate(self, docs, lm, provide_description, num_fewshot):
+    # TODO: Implement evaluation code
-        # TODO: implement
-        raise NotImplementedError()
+    # ***IMPORTANT***: this evaluation function needs to be written for the new framework. 
+    # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py. 
+    # Remove this comment when the evaluation code is implemented.
 class ANLIRound1(ANLIBase):
    SPLIT = 1

--- a/lm_eval/tasks/drop.py
+++ b/lm_eval/tasks/drop.py
@@ -59,22 +59,11 @@ class DROP(Dataset):
            qa_texts.append(text)
        return ''.join([doctext, '\n'.join(qa_texts)])
-    def evaluate(self, docs, lm, provide_description, num_fewshot):
-        """Take iterable of docs and evaluates, returning a dict with the following format:
-        {
-            "major": float,
-            "minor": dict,
-            "higher_is_better": bool,
-        }
-        * `major` should be a single, representative number, for programmatic comparison
-        * `minor` should be a dictionary containing all relevant sub-metrics
-        * `higher_is_better` determines whether a higher metric is better
-        """
-        pass
    def fewshot_description(self):
        return "Read the passage and answer the questions "
+    # TODO: Implement evaluation code
+    # ***IMPORTANT***: this evaluation function needs to be written for the new framework. 
+    # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py. 
+    # Remove this comment when the evaluation code is implemented.
--- a/lm_eval/tasks/glue.py
+++ b/lm_eval/tasks/glue.py
@@ -46,6 +46,12 @@ class CoLA(HFTask):
        return text
    def evaluate(self, docs, lm, provide_description, num_fewshot):
+        # TODO: Implement evaluation code using new framework
+        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework. 
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py. 
+        # Remove this comment when the evaluation code is implemented.
        golds = [doc["label"] for doc in docs]
        preds = []
        for doc in tqdm_lib.tqdm(docs):
@@ -99,6 +105,11 @@ class MNLI(HFTask):
        return text
    def evaluate(self, docs, lm, provide_description, num_fewshot):
+        # TODO: Implement evaluation code using new framework
+        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework. 
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py. 
+        # Remove this comment when the evaluation code is implemented.
        golds = [doc["label"] for doc in docs]
        preds = []
        for doc in tqdm_lib.tqdm(docs):
@@ -153,6 +164,11 @@ class MRPC(HFTask):
        return text
    def evaluate(self, docs, lm, provide_description, num_fewshot):
+        # TODO: Implement evaluation code using new framework
+        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework. 
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py. 
+        # Remove this comment when the evaluation code is implemented.
        golds = [doc["label"] for doc in docs]
        preds = []
        for doc in tqdm_lib.tqdm(docs):
@@ -190,6 +206,11 @@ class RTE(HFTask):
        return text
    def evaluate(self, docs, lm, provide_description, num_fewshot):
+        # TODO: Implement evaluation code using new framework
+        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework. 
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py. 
+        # Remove this comment when the evaluation code is implemented.
        golds = [doc["label"] for doc in docs]
        preds = []
        for doc in tqdm_lib.tqdm(docs):
@@ -227,6 +248,11 @@ class QNLI(HFTask):
        return text
    def evaluate(self, docs, lm, provide_description, num_fewshot):
+        # TODO: Implement evaluation code using new framework
+        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework. 
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py. 
+        # Remove this comment when the evaluation code is implemented.
        golds = [doc["label"] for doc in docs]
        preds = []
        for doc in tqdm_lib.tqdm(docs):
@@ -265,6 +291,11 @@ class QQP(HFTask):
        return text
    def evaluate(self, docs, lm, provide_description, num_fewshot):
+        # TODO: Implement evaluation code using new framework
+        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework. 
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py. 
+        # Remove this comment when the evaluation code is implemented.
        golds = [doc["label"] for doc in docs]
        preds = []
        for doc in tqdm_lib.tqdm(docs):
@@ -304,6 +335,11 @@ class STSB(HFTask):
        return text
    def evaluate(self, docs, lm, provide_description, num_fewshot):
+        # TODO: Implement evaluation code using new framework
+        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework. 
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py. 
+        # Remove this comment when the evaluation code is implemented.
        golds = [doc["label"] for doc in docs]
        preds = []
        for doc in tqdm_lib.tqdm(docs):
@@ -359,6 +395,11 @@ class SST(HFTask):
        return text
    def evaluate(self, docs, lm, provide_description, num_fewshot):
+        # TODO: Implement evaluation code using new framework
+        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework. 
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py. 
+        # Remove this comment when the evaluation code is implemented.
        golds = [doc["label"] for doc in docs]
        preds = []
        for doc in tqdm_lib.tqdm(docs):
@@ -397,6 +438,11 @@ class WNLI(HFTask):
        return text
    def evaluate(self, docs, lm, provide_description, num_fewshot):
+        # TODO: Implement evaluation code using new framework
+        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework. 
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py. 
+        # Remove this comment when the evaluation code is implemented.
        golds = [doc["label"] for doc in docs]
        preds = []
        for doc in tqdm_lib.tqdm(docs):

--- a/lm_eval/tasks/hellaswag.py
+++ b/lm_eval/tasks/hellaswag.py
@@ -51,6 +51,8 @@ class HellaSwag(HFTask):
            text += doc['endings'][index]
        return text
-    def evaluate(self, docs, lm, provide_description, num_fewshot):
+    # TODO: Implement evaluation code
-        # TODO: Write evaluation function
-        raise NotImplementedError()
+    # ***IMPORTANT***: this evaluation function needs to be written for the new framework. 
+    # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py. 
+    # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
--- a/lm_eval/tasks/lambada.py
+++ b/lm_eval/tasks/lambada.py
@@ -46,5 +46,8 @@ class Lambada(Dataset):
        #label = doc[]
        return doc
-    def evaluate(self, docs, lm, provide_description, num_fewshot):
+    # TODO: Implement evaluation code
-        pass
\ No newline at end of file
+    # ***IMPORTANT***: this evaluation function needs to be written for the new framework. 
+    # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py. 
+    # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
--- a/lm_eval/tasks/naturalqs.py
+++ b/lm_eval/tasks/naturalqs.py
@@ -50,6 +50,8 @@ class NaturalQs(HFTask):
        return text
-    def evaluate(self, docs, lm, provide_description, num_fewshot):
+    # TODO: Implement evaluation code
-        # TODO: implement
-        raise NotImplementedError()
+    # ***IMPORTANT***: this evaluation function needs to be written for the new framework. 
\ No newline at end of file
+    # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py. 
+    # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
--- a/lm_eval/tasks/openbookqa.py
+++ b/lm_eval/tasks/openbookqa.py
@@ -53,6 +53,8 @@ class OpenBookQA(HFTask):
            text += doc['choices']['text'][index] + '.'
        return text
-    def evaluate(self, docs, lm, provide_description, num_fewshot):
+    # TODO: Implement evaluation code
-        # TODO: Write evaluation function
-        raise NotImplementedError()
+    # ***IMPORTANT***: this evaluation function needs to be written for the new framework. 
+    # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py. 
+    # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
--- a/lm_eval/tasks/piqa.py
+++ b/lm_eval/tasks/piqa.py
@@ -54,6 +54,8 @@ class PiQA(Dataset):
        #TODO: check if oa uses newline
        return  doc['goal'] + ' '
-    def evaluate(self, docs, lm):
+    # TODO: Implement evaluation code
-        pass
+    # ***IMPORTANT***: this evaluation function needs to be written for the new framework. 
+    # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py. 
+    # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
--- a/lm_eval/tasks/quac.py
+++ b/lm_eval/tasks/quac.py
@@ -61,5 +61,8 @@ class QuAC(Dataset):
            text += doc['answer']
        return text
-    def evaluate(self, docs, lm):
+    # TODO: Implement evaluation code
-        pass
+    # ***IMPORTANT***: this evaluation function needs to be written for the new framework. 
+    # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py. 
+    # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
--- a/lm_eval/tasks/race.py
+++ b/lm_eval/tasks/race.py
@@ -67,6 +67,8 @@ class RACE(HFTask):
        return r
-    def evaluate(self, docs, lm, provide_description, num_fewshot):
+    # TODO: Implement evaluation code
-        # TODO: implement
-        raise NotImplementedError()
+    # ***IMPORTANT***: this evaluation function needs to be written for the new framework. 
\ No newline at end of file
+    # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py. 
+    # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
--- a/lm_eval/tasks/sat.py
+++ b/lm_eval/tasks/sat.py
@@ -93,6 +93,8 @@ class SATAnalogies(Dataset):
        return text
-    def evaluate(self, docs, lm):
+    # TODO: Implement evaluation code
-        # TODO: Write evaluation function
-        raise NotImplementedError()
+    # ***IMPORTANT***: this evaluation function needs to be written for the new framework. 
+    # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py. 
+    # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
--- a/lm_eval/tasks/squad.py
+++ b/lm_eval/tasks/squad.py
@@ -42,6 +42,8 @@ class SQuAD(HFTask):
            text += answer
        return text
-    def evaluate(self, docs, lm, provide_description, num_fewshot):
+    # TODO: Implement evaluation code
-        # TODO: Write evaluation function
-        raise NotImplementedError()
+    # ***IMPORTANT***: this evaluation function needs to be written for the new framework. 
\ No newline at end of file
+    # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py. 
+    # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
--- a/lm_eval/tasks/storycloze.py
+++ b/lm_eval/tasks/storycloze.py
@@ -47,6 +47,9 @@ class StoryCloze(Dataset):
        else:
            return ' '.join([*doc[1:5]])
-    def evaluate(self, docs, lm):
+    # TODO: Implement evaluation code
-        pass
+    # ***IMPORTANT***: this evaluation function needs to be written for the new framework. 
+    # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py. 
+    # Remove this comment when the evaluation code is implemented.
--- a/lm_eval/tasks/superglue.py
+++ b/lm_eval/tasks/superglue.py
@@ -76,6 +76,11 @@ class CommitmentBank(HFTask):
        return text
    def evaluate(self, docs, lm, provide_description, num_fewshot):
+        # TODO: Implement evaluation code using new framework
+        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework. 
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py. 
+        # Remove this comment when the evaluation code is implemented.
        golds = [doc["label"] for doc in docs]
        preds = []
        for doc in tqdm_lib.tqdm(docs):
@@ -120,6 +125,11 @@ class Copa(HFTask):
        return text
    def evaluate(self, docs, lm, provide_description, num_fewshot):
+        # TODO: Implement evaluation code using new framework
+        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework. 
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py. 
+        # Remove this comment when the evaluation code is implemented.
        golds = [doc["label"] for doc in docs]
        preds = []
        for doc in tqdm_lib.tqdm(docs):
@@ -165,6 +175,11 @@ class MultiRC(HFTask):
        return f"[{label_str}] {answer}"
    def evaluate(self, docs, lm, provide_description, num_fewshot):
+        # TODO: Implement evaluation code using new framework
+        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework. 
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py. 
+        # Remove this comment when the evaluation code is implemented.
        preds = []
        for doc in docs:
            ctx = self.fewshot_context(
@@ -220,6 +235,11 @@ class WordsInContext(HFTask):
        return text
    def evaluate(self, docs, lm, provide_description, num_fewshot):
+        # TODO: Implement evaluation code using new framework
+        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework. 
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py. 
+        # Remove this comment when the evaluation code is implemented.
        golds = [doc["label"] for doc in docs]
        preds = []
        for doc in tqdm_lib.tqdm(docs):
@@ -280,6 +300,11 @@ class SGWinogradSchemaChallenge(HFTask):
        return text
    def evaluate(self, docs, lm, provide_description, num_fewshot):
+        # TODO: Implement evaluation code using new framework
+        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework. 
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py. 
+        # Remove this comment when the evaluation code is implemented.
        golds = [doc["label"] for doc in docs]
        preds = []
        for doc in tqdm_lib.tqdm(docs):
@@ -314,7 +339,10 @@ class RTE(HFTask):
            return ''.join([doc['premise'], '\nquestion: ',doc['hypothesis'], ' True or False?\nanswer: ', answer])
        else:
            return ''.join([doc['premise'], '\nquestion: ',doc['hypothesis'], ' True or False?\nanswer: '])
-    def evaluate(self, docs, lm, provide_description, num_fewshot):
-        #TODO: 
+    # TODO: Implement evaluation code
-        pass
+    # ***IMPORTANT***: this evaluation function needs to be written for the new framework. 
+    # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py. 
+    # Remove this comment when the evaluation code is implemented.
--- a/lm_eval/tasks/triviaqa.py
+++ b/lm_eval/tasks/triviaqa.py
@@ -44,6 +44,9 @@ class TriviaQA(Dataset):
            return ''.join(['Q: ', doc['Question'], '\n\n','A: ', doc['Answer']['Aliases'][0]])
        else:
            return ''.join(['Q: ', doc['Question'], '\n\n','A: '])
-    def evaluate(self, docs, lm):
-        pass
+    # TODO: Implement evaluation code
+    # ***IMPORTANT***: this evaluation function needs to be written for the new framework. 
+    # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py. 
+    # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
--- a/lm_eval/tasks/webqs.py
+++ b/lm_eval/tasks/webqs.py
@@ -29,6 +29,8 @@ class WebQs(HFTask):
        a = "A:" + ((" " + doc['answers'][0]) if include_target else '')
        return q + a
-    def evaluate(self, docs, lm, provide_description, num_fewshot):
+    # TODO: Implement evaluation code
-        # TODO: implement
-        raise NotImplementedError()
+    # ***IMPORTANT***: this evaluation function needs to be written for the new framework. 
\ No newline at end of file
+    # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py. 
+    # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
--- a/lm_eval/tasks/wikitext.py
+++ b/lm_eval/tasks/wikitext.py
@@ -15,8 +15,12 @@ class WikiText103(NLP_TASK):
    def doc_to_text(self, doc, include_target=True):
        return doc['text']
-    def evaluate(self, docs, lm, provide_description, num_fewshot):
-        pass
+    # TODO: Implement evaluation code
+    # ***IMPORTANT***: this evaluation function needs to be written for the new framework. 
+    # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py. 
+    # Remove this comment when the evaluation code is implemented.
 class WikiText2(NLP_TASK):
@@ -28,5 +32,9 @@ class WikiText2(NLP_TASK):
    def doc_to_text(self, doc, include_target=True):
        return doc['text']
-    def evaluate(self, docs, lm, provide_description, num_fewshot):
-        pass
+    # TODO: Implement evaluation code
\ No newline at end of file
+    # ***IMPORTANT***: this evaluation function needs to be written for the new framework. 
+    # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py. 
+    # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
--- a/lm_eval/tasks/winogrande.py
+++ b/lm_eval/tasks/winogrande.py
@@ -47,6 +47,8 @@ class Winogrande(HFTask):
            text = text.replace("_", answer)
        return text
-    def evaluate(self, docs, lm, provide_description, num_fewshot):
+    # TODO: Implement evaluation code
-        # TODO: Write evaluation function
-        raise NotImplementedError()
+    # ***IMPORTANT***: this evaluation function needs to be written for the new framework. 
\ No newline at end of file
+    # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py. 
+    # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
--- a/lm_eval/tasks/wsc273.py
+++ b/lm_eval/tasks/wsc273.py
@@ -80,6 +80,8 @@ class WinogradSchemaChallenge273(Dataset):
        text = doc['completions']['T'] + ' True. ' + doc['completions']['F'] + ' False.'
        return text
-    def evaluate(self, docs, lm):
+    # TODO: Implement evaluation code
-        # TODO: Write evaluation function
-        raise NotImplementedError()
+    # ***IMPORTANT***: this evaluation function needs to be written for the new framework. 
+    # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py. 
+    # Remove this comment when the evaluation code is implemented.