Commit 0b3b7251 authored by Leo Gao

Merge branch 'bmk_refactor2' of github.com:EleutherAI/lm_evaluation_harness into bmk_refactor2

Parents: a18104a4, 5ce42fc0
@@ -45,9 +45,11 @@ class ANLIBase(HFTask):
         a = "True, False, or Neither?" + ((" " + ["True", "Neither", "False"][doc['label']]) if include_target else '')
         return q + a
 
     def evaluate(self, docs, lm, provide_description, num_fewshot):
-        # TODO: implement
-        raise NotImplementedError()
+        # TODO: Implement evaluation code
+        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+        # Remove this comment when the evaluation code is implemented.
 
 class ANLIRound1(ANLIBase):
     SPLIT = 1
@@ -58,23 +58,12 @@ class DROP(Dataset):
             text = ''.join([text, get_answer(pair['answer'])])
             qa_texts.append(text)
         return ''.join([doctext, '\n'.join(qa_texts)])
 
-    def evaluate(self, docs, lm, provide_description, num_fewshot):
-        """Take iterable of docs and evaluates, returning a dict with the following format:
-        {
-            "major": float,
-            "minor": dict,
-            "higher_is_better": bool,
-        }
-        * `major` should be a single, representative number, for programmatic comparison
-        * `minor` should be a dictionary containing all relevant sub-metrics
-        * `higher_is_better` determines whether a higher metric is better
-        """
-        pass
 
     def fewshot_description(self):
         return "Read the passage and answer the questions "
+
+    # TODO: Implement evaluation code
+    # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
+    # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+    # Remove this comment when the evaluation code is implemented.
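The docstring removed above is the clearest statement of the evaluate() contract that these TODOs refer to. For reference, a minimal sketch of an implementation satisfying that contract. This is a sketch only, not the actual framework code: the doc["label"] field, lm.generate(), and the fewshot_context() argument order are assumptions, and the real interface is defined in base.py.

def evaluate(self, docs, lm, provide_description, num_fewshot):
    golds = [doc["label"] for doc in docs]        # assumed gold-label field
    preds = []
    for doc in docs:
        # Build the few-shot prompt for this doc (argument order assumed).
        ctx = self.fewshot_context(doc, provide_description, num_fewshot)
        preds.append(lm.generate(ctx))            # assumed LM completion call
    acc = sum(p == g for p, g in zip(preds, golds)) / len(golds)
    return {
        "major": acc,                 # single representative number
        "minor": {"acc": acc},        # dict of all relevant sub-metrics
        "higher_is_better": True,     # higher accuracy is better
    }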
@@ -46,6 +46,12 @@ class CoLA(HFTask):
         return text
 
     def evaluate(self, docs, lm, provide_description, num_fewshot):
+        # TODO: Implement evaluation code using new framework
+        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+        # Remove this comment when the evaluation code is implemented.
         golds = [doc["label"] for doc in docs]
         preds = []
         for doc in tqdm_lib.tqdm(docs):
@@ -99,6 +105,11 @@ class MNLI(HFTask):
         return text
 
     def evaluate(self, docs, lm, provide_description, num_fewshot):
+        # TODO: Implement evaluation code using new framework
+        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+        # Remove this comment when the evaluation code is implemented.
         golds = [doc["label"] for doc in docs]
         preds = []
         for doc in tqdm_lib.tqdm(docs):
@@ -153,6 +164,11 @@ class MRPC(HFTask):
         return text
 
     def evaluate(self, docs, lm, provide_description, num_fewshot):
+        # TODO: Implement evaluation code using new framework
+        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+        # Remove this comment when the evaluation code is implemented.
         golds = [doc["label"] for doc in docs]
         preds = []
         for doc in tqdm_lib.tqdm(docs):
@@ -190,6 +206,11 @@ class RTE(HFTask):
         return text
 
     def evaluate(self, docs, lm, provide_description, num_fewshot):
+        # TODO: Implement evaluation code using new framework
+        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+        # Remove this comment when the evaluation code is implemented.
         golds = [doc["label"] for doc in docs]
         preds = []
         for doc in tqdm_lib.tqdm(docs):
@@ -227,6 +248,11 @@ class QNLI(HFTask):
         return text
 
     def evaluate(self, docs, lm, provide_description, num_fewshot):
+        # TODO: Implement evaluation code using new framework
+        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+        # Remove this comment when the evaluation code is implemented.
         golds = [doc["label"] for doc in docs]
         preds = []
         for doc in tqdm_lib.tqdm(docs):
@@ -265,6 +291,11 @@ class QQP(HFTask):
         return text
 
     def evaluate(self, docs, lm, provide_description, num_fewshot):
+        # TODO: Implement evaluation code using new framework
+        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+        # Remove this comment when the evaluation code is implemented.
         golds = [doc["label"] for doc in docs]
         preds = []
         for doc in tqdm_lib.tqdm(docs):
@@ -304,6 +335,11 @@ class STSB(HFTask):
         return text
 
     def evaluate(self, docs, lm, provide_description, num_fewshot):
+        # TODO: Implement evaluation code using new framework
+        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+        # Remove this comment when the evaluation code is implemented.
         golds = [doc["label"] for doc in docs]
         preds = []
         for doc in tqdm_lib.tqdm(docs):
@@ -359,6 +395,11 @@ class SST(HFTask):
         return text
 
     def evaluate(self, docs, lm, provide_description, num_fewshot):
+        # TODO: Implement evaluation code using new framework
+        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+        # Remove this comment when the evaluation code is implemented.
         golds = [doc["label"] for doc in docs]
         preds = []
         for doc in tqdm_lib.tqdm(docs):
@@ -397,6 +438,11 @@ class WNLI(HFTask):
         return text
 
     def evaluate(self, docs, lm, provide_description, num_fewshot):
+        # TODO: Implement evaluation code using new framework
+        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+        # Remove this comment when the evaluation code is implemented.
         golds = [doc["label"] for doc in docs]
         preds = []
         for doc in tqdm_lib.tqdm(docs):
@@ -51,6 +51,8 @@ class HellaSwag(HFTask):
             text += doc['endings'][index]
         return text
 
     def evaluate(self, docs, lm, provide_description, num_fewshot):
-        # TODO: Write evaluation function
-        raise NotImplementedError()
+        # TODO: Implement evaluation code
+        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+        # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
@@ -46,5 +46,8 @@ class Lambada(Dataset):
         #label = doc[]
         return doc
 
     def evaluate(self, docs, lm, provide_description, num_fewshot):
-        pass
\ No newline at end of file
+        # TODO: Implement evaluation code
+        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+        # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
@@ -50,6 +50,8 @@ class NaturalQs(HFTask):
         return text
 
     def evaluate(self, docs, lm, provide_description, num_fewshot):
-        # TODO: implement
-        raise NotImplementedError()
\ No newline at end of file
+        # TODO: Implement evaluation code
+        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+        # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
@@ -53,6 +53,8 @@ class OpenBookQA(HFTask):
             text += doc['choices']['text'][index] + '.'
         return text
 
     def evaluate(self, docs, lm, provide_description, num_fewshot):
-        # TODO: Write evaluation function
-        raise NotImplementedError()
+        # TODO: Implement evaluation code
+        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+        # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
@@ -54,6 +54,8 @@ class PiQA(Dataset):
         #TODO: check if oa uses newline
         return doc['goal'] + ' '
 
     def evaluate(self, docs, lm):
-        pass
+        # TODO: Implement evaluation code
+        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+        # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
@@ -61,5 +61,8 @@ class QuAC(Dataset):
         text += doc['answer']
         return text
 
     def evaluate(self, docs, lm):
-        pass
+        # TODO: Implement evaluation code
+        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+        # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
@@ -67,6 +67,8 @@ class RACE(HFTask):
         return r
 
     def evaluate(self, docs, lm, provide_description, num_fewshot):
-        # TODO: implement
-        raise NotImplementedError()
\ No newline at end of file
+        # TODO: Implement evaluation code
+        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+        # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
@@ -18,6 +18,7 @@ class SATAnalogies(Dataset):
         # We should be using a checksum here.
         # The canonical sha256 hash is below:
         # 9dece377d8d57253ef8c78370ff15de0bb1d9e90a82c815a67ba1e621e921bfc
         if not os.path.exists('data/sat/SAT-package-V3.txt'):
             raise NotImplementedError('SAT Analogies dataset is not provided. Follow instructions on https://aclweb.org/aclwiki/SAT_Analogy_Questions_(State_of_the_art) to locate.')
@@ -32,7 +33,6 @@ class SATAnalogies(Dataset):
     def training_docs(self):
         return []
 
     def test_docs(self):
         return []
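The checksum comment in the SATAnalogies hunk above calls for a verification step. A minimal sketch of that check, using Python's standard hashlib: the file path and expected hash are taken from the source comments, while the helper name verify_sat_data is hypothetical.

import hashlib

EXPECTED_SHA256 = "9dece377d8d57253ef8c78370ff15de0bb1d9e90a82c815a67ba1e621e921bfc"

def verify_sat_data(path='data/sat/SAT-package-V3.txt'):
    # Hypothetical helper: compare the file's sha256 digest against the
    # canonical hash quoted above, and fail loudly if the data is corrupt.
    with open(path, 'rb') as f:
        actual = hashlib.sha256(f.read()).hexdigest()
    if actual != EXPECTED_SHA256:
        raise ValueError(f"Checksum mismatch for {path}: got {actual}")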
@@ -42,6 +42,8 @@ class SQuAD(HFTask):
         text += answer
         return text
 
     def evaluate(self, docs, lm, provide_description, num_fewshot):
-        # TODO: Write evaluation function
-        raise NotImplementedError()
\ No newline at end of file
+        # TODO: Implement evaluation code
+        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+        # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
@@ -47,6 +47,9 @@ class StoryCloze(Dataset):
         else:
             return ' '.join([*doc[1:5]])
 
     def evaluate(self, docs, lm):
-        pass
+        # TODO: Implement evaluation code
+        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+        # Remove this comment when the evaluation code is implemented.
@@ -82,6 +82,11 @@ class CommitmentBank(HFTask):
         return text
 
     def evaluate(self, docs, lm, provide_description, num_fewshot):
+        # TODO: Implement evaluation code using new framework
+        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+        # Remove this comment when the evaluation code is implemented.
         golds = [doc["label"] for doc in docs]
         preds = []
         for doc in tqdm_lib.tqdm(docs):
@@ -126,6 +131,11 @@ class Copa(HFTask):
         return text
 
     def evaluate(self, docs, lm, provide_description, num_fewshot):
+        # TODO: Implement evaluation code using new framework
+        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+        # Remove this comment when the evaluation code is implemented.
         golds = [doc["label"] for doc in docs]
         preds = []
         for doc in tqdm_lib.tqdm(docs):
@@ -171,6 +181,11 @@ class MultiRC(HFTask):
         return f"[{label_str}] {answer}"
 
     def evaluate(self, docs, lm, provide_description, num_fewshot):
+        # TODO: Implement evaluation code using new framework
+        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+        # Remove this comment when the evaluation code is implemented.
         preds = []
         for doc in docs:
             ctx = self.fewshot_context(
@@ -226,6 +241,11 @@ class WordsInContext(HFTask):
         return text
 
     def evaluate(self, docs, lm, provide_description, num_fewshot):
+        # TODO: Implement evaluation code using new framework
+        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+        # Remove this comment when the evaluation code is implemented.
         golds = [doc["label"] for doc in docs]
         preds = []
         for doc in tqdm_lib.tqdm(docs):
@@ -286,6 +306,11 @@ class SGWinogradSchemaChallenge(HFTask):
         return text
 
     def evaluate(self, docs, lm, provide_description, num_fewshot):
+        # TODO: Implement evaluation code using new framework
+        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+        # Remove this comment when the evaluation code is implemented.
         golds = [doc["label"] for doc in docs]
         preds = []
         for doc in tqdm_lib.tqdm(docs):
@@ -320,7 +345,10 @@ class RTE(HFTask):
             return ''.join([doc['premise'], '\nquestion: ', doc['hypothesis'], ' True or False?\nanswer: ', answer])
         else:
             return ''.join([doc['premise'], '\nquestion: ', doc['hypothesis'], ' True or False?\nanswer: '])
 
     def evaluate(self, docs, lm, provide_description, num_fewshot):
-        #TODO:
-        pass
+        # TODO: Implement evaluation code
+        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+        # Remove this comment when the evaluation code is implemented.
@@ -44,6 +44,9 @@ class TriviaQA(Dataset):
             return ''.join(['Q: ', doc['Question'], '\n\n', 'A: ', doc['Answer']['Aliases'][0]])
         else:
             return ''.join(['Q: ', doc['Question'], '\n\n', 'A: '])
 
     def evaluate(self, docs, lm):
-        pass
+        # TODO: Implement evaluation code
+        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+        # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
@@ -29,6 +29,8 @@ class WebQs(HFTask):
         a = "A:" + ((" " + doc['answers'][0]) if include_target else '')
         return q + a
 
     def evaluate(self, docs, lm, provide_description, num_fewshot):
-        # TODO: implement
-        raise NotImplementedError()
\ No newline at end of file
+        # TODO: Implement evaluation code
+        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+        # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
@@ -15,8 +15,12 @@ class WikiText103(NLP_TASK):
     def doc_to_text(self, doc, include_target=True):
         return doc['text']
 
     def evaluate(self, docs, lm, provide_description, num_fewshot):
-        pass
+        # TODO: Implement evaluation code
+        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+        # Remove this comment when the evaluation code is implemented.
 
 class WikiText2(NLP_TASK):
@@ -28,5 +32,9 @@ class WikiText2(NLP_TASK):
     def doc_to_text(self, doc, include_target=True):
         return doc['text']
 
     def evaluate(self, docs, lm, provide_description, num_fewshot):
-        pass
\ No newline at end of file
+        # TODO: Implement evaluation code
+        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+        # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
@@ -47,6 +47,8 @@ class Winogrande(HFTask):
         text = text.replace("_", answer)
         return text
 
     def evaluate(self, docs, lm, provide_description, num_fewshot):
-        # TODO: Write evaluation function
-        raise NotImplementedError()
\ No newline at end of file
+        # TODO: Implement evaluation code
+        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+        # Remove this comment when the evaluation code is implemented.
\ No newline at end of file
@@ -80,6 +80,8 @@ class WinogradSchemaChallenge273(Dataset):
         text = doc['completions']['T'] + ' True. ' + doc['completions']['F'] + ' False.'
         return text
 
     def evaluate(self, docs, lm):
-        # TODO: Write evaluation function
-        raise NotImplementedError()
+        # TODO: Implement evaluation code
+        # ***IMPORTANT***: this evaluation function needs to be written for the new framework.
+        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
+        # Remove this comment when the evaluation code is implemented.