Merge branch 'master' into thomas/fix_best_download_version

78824d7f · Thomas Wang · GitHub · c65412e5 · cc238121 · 78824d7f
Unverified commit 78824d7f, authored Jan 08, 2022 by Thomas Wang; committed by GitHub on Jan 08, 2022.
20 changed files
--- a/lm_eval/tasks/hendrycks_test.py
+++ b/lm_eval/tasks/hendrycks_test.py
@@ -114,9 +114,5 @@ class GeneralHendrycksTest(MultipleChoiceTask):
        return rnd.sample(list(self._fewshot_docs), k)
-    def fewshot_description(self):
-        subject = self.subject.replace("_", " ")
-        return f"The following are multiple choice questions (with answers) about {subject}."
    def doc_to_text(self, doc):
        return doc["query"]
--- a/lm_eval/tasks/lambada.py
+++ b/lm_eval/tasks/lambada.py
@@ -47,10 +47,6 @@ class LAMBADA(Task):
    def doc_to_target(self, doc):
        return " " + doc['text'].rsplit(' ', 1)[1]
-    def fewshot_description(self):
-        # TODO: figure out description
-        return ""
    def construct_requests(self, doc, ctx):
        ll, is_greedy = rf.loglikelihood(ctx, self.doc_to_target(doc))

--- a/lm_eval/tasks/lambada_cloze.py
+++ b/lm_eval/tasks/lambada_cloze.py
@@ -13,6 +13,3 @@ class LAMBADA_cloze(LAMBADA):
    def doc_to_target(self, doc):
        return " " + doc['text'].rsplit(' ', 1)[1]
-    def fewshot_description(self):
-        return "Fill in blank:\n"
--- a/lm_eval/tasks/logiqa.py
+++ b/lm_eval/tasks/logiqa.py
@@ -80,9 +80,5 @@ class LogiQA(MultipleChoiceTask):
    def test_docs(self):
        return self._load_docs(self.DATASET_PATH / "Test.txt")
-    def fewshot_description(self):
-        # TODO: figure out actual description
-        return ""
    def doc_to_text(self, doc):
        return doc["query"]
--- a/lm_eval/tasks/mathqa.py
+++ b/lm_eval/tasks/mathqa.py
@@ -29,9 +29,5 @@ class MathQA(HFTask, MultipleChoiceTask):
        }
        return out_doc
-    def fewshot_description(self):
-        # TODO: figure out description
-        return ""
    def doc_to_text(self, doc):
        return doc["query"]
--- a/lm_eval/tasks/mc_taco.py
+++ b/lm_eval/tasks/mc_taco.py
@@ -39,9 +39,6 @@ class MCTACO(HFTask):
    def has_test_docs(self):
        return True
-    def fewshot_description(self):
-        return "Determine whether the candidate answer is plausible (\"yes\") or not (\"no\")"
    def doc_to_text(self, doc):
        return f"{doc['sentence']}\nQuestion: {doc['question']}\n"\
            f"Answer: {doc['answer']}\nPlausible:"

--- a/lm_eval/tasks/mutual.py
+++ b/lm_eval/tasks/mutual.py
@@ -70,10 +70,6 @@ class MuTualBase(Task):
    def test_docs(self):
        return NotImplemented
-    def fewshot_description(self):
-        # TODO: figure out fewshot description
-        return ""
    def doc_to_text(self, doc):
        return self.detokenize(doc["article"])

--- a/lm_eval/tasks/naturalqs.py
+++ b/lm_eval/tasks/naturalqs.py
@@ -21,10 +21,6 @@ class NaturalQs(HFTask):
    def has_test_docs(self):
        return False
-    def fewshot_description(self):
-        # TODO: figure out description
-        return ""
    def training_docs(self):
        # Cache training for faster few-shot.
        # Data is too large to fit in memory.

--- a/lm_eval/tasks/openbookqa.py
+++ b/lm_eval/tasks/openbookqa.py
@@ -25,9 +25,5 @@ class OpenBookQA(HFTask, MultipleChoiceTask):
        }
        return out_doc
-    def fewshot_description(self):
-        # TODO: figure out fewshot description
-        return ""
    def doc_to_text(self, doc):
        return doc["query"]
--- a/lm_eval/tasks/pile.py
+++ b/lm_eval/tasks/pile.py
@@ -10,7 +10,7 @@ from best_download import download_file
 class PilePerplexityTask(PerplexityTask, abc.ABC):
-    VERSION = 0
+    VERSION = 1
    PILE_SET_NAME = None
    VAL_PATH = 'data/pile/val.jsonl.zst'
@@ -18,9 +18,11 @@ class PilePerplexityTask(PerplexityTask, abc.ABC):
    def download(self):
        # TODO: separate pile val/test out by component so we don't have to scan the entire file once per set
-        os.makedirs("data/pile/", exist_ok=True)
+        if not os.path.exists("data/pile/test.jsonl.zst"):
-        download_file("https://the-eye.eu/public/AI/pile/val.jsonl.zst", local_file=self.VAL_PATH, expected_checksum="264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92")
+            # todo use new best_download fallback api
-        download_file("https://the-eye.eu/public/AI/pile/test.jsonl.zst", local_file=self.TEST_PATH, expected_checksum="0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e")
+            os.makedirs("data/pile/", exist_ok=True)
+            download_file("http://eaidata.bmk.sh/data/pile/val.jsonl.zst", local_file=self.VAL_PATH, expected_checksum="264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92")
+            download_file("http://eaidata.bmk.sh/data/pile/test.jsonl.zst", local_file=self.TEST_PATH, expected_checksum="0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e")
    def validation_docs(self):
        rdr = lm_dataformat.Reader(self.VAL_PATH)

--- a/lm_eval/tasks/piqa.py
+++ b/lm_eval/tasks/piqa.py
@@ -18,10 +18,6 @@ class PiQA(HFTask, MultipleChoiceTask):
    def has_test_docs(self):
        return False
-    def fewshot_description(self):
-        # TODO: figure out fewshot description
-        return ""
    def _convert_standard(self, doc):
        out_doc = {
            "goal": doc["goal"],

--- a/lm_eval/tasks/prost.py
+++ b/lm_eval/tasks/prost.py
@@ -36,13 +36,14 @@ class PROST(HFTask, MultipleChoiceTask):
    def has_test_docs(self):
        return True
-    def fewshot_description(self):
+    def fewshot_context(self, doc, num_fewshot, provide_description=None, rnd=None, description=None):
-        # TODO: figure out fewshot description
-        return ""
-    def fewshot_context(self, doc, num_fewshot, provide_description, rnd):
        assert num_fewshot == 0, 'PROST is designed to probe models in a zero-shot fashion only.'
-        return super().fewshot_context(doc, num_fewshot, provide_description, rnd)
+        return super().fewshot_context(
+            doc=doc,
+            num_fewshot=num_fewshot,
+            rnd=rnd,
+            description=description
+        )
    def _convert_standard(self, doc):
        out_doc = {

--- a/lm_eval/tasks/pubmedqa.py
+++ b/lm_eval/tasks/pubmedqa.py
@@ -23,11 +23,6 @@ class Pubmed_QA(HFTask):
            # HF is labelled as train but its really just for testing
            return self.data["train"]
-    def fewshot_description(self):
-        # Average ctx length in labelled dataset is 238.9
-        # 2 few-shot exmamples pushes it beyond context window
-        return ""
    def doc_to_text(self, doc):
        ctxs = "\n".join(doc["context"]["contexts"])
        return "Abstract: {}\nQuestion: {}\nAnswer:".format(

--- a/lm_eval/tasks/qa4mre.py
+++ b/lm_eval/tasks/qa4mre.py
@@ -67,9 +67,6 @@ class QA4MRE(MultipleChoiceTask):
                out_doc['source'] = src
                yield out_doc
-    def fewshot_description(self):
-        return ""
    def test_docs(self):
        return self.load_docs(f"data/qa4mre/QA4MRE-{self.YEAR}-EN_GS.xml")

--- a/lm_eval/tasks/quac.py
+++ b/lm_eval/tasks/quac.py
@@ -51,11 +51,6 @@ class QuAC(Task):
    def test_docs(self):
        raise NotImplementedError("QuAC has no test docs.")
-    def fewshot_description(self):
-        # TODO: figure out fewshot description
-        desc = "TITLE: Title of the context passage - subtitle of the passage\nPARAGRAPH: Passage describing the relevant information for answering questions.\n\nQ: Text of a question.\n\nA: Answer to the question, based on the passage. If it cannot be answered based on the passage, write CANNOTANSWER"
-        return desc
    def load_doc(self, myjson):
        docs = []
        for item in myjson:

--- a/lm_eval/tasks/race.py
+++ b/lm_eval/tasks/race.py
@@ -65,10 +65,6 @@ class RACE(HFTask):
    def test_docs(self):
        return self._collate_data("test")
-    def fewshot_description(self):
-        # TODO: figure out description
-        return ""
    @classmethod
    def get_answer_option(cls, problem):
        answer = cls.letter_to_num[problem['answer']]

--- a/lm_eval/tasks/sat.py
+++ b/lm_eval/tasks/sat.py
@@ -61,10 +61,5 @@ class SATAnalogies(MultipleChoiceTask):
            }
            yield doc
-    def fewshot_description(self):
-        # TODO: figure out actual description
-        return ""
    def doc_to_text(self, doc):
        return "{} is to {} as".format(*doc['query'])
--- a/lm_eval/tasks/sciq.py
+++ b/lm_eval/tasks/sciq.py
@@ -50,9 +50,6 @@ class SciQ(MultipleChoiceTask):
        for record in docs:
            yield self._convert_standard(record)
-    def fewshot_description(self):
-        return ""
    def training_docs(self):
        return self.load_docs("data/sciq/SciQ dataset-2 3/train.json")
@@ -63,4 +60,4 @@ class SciQ(MultipleChoiceTask):
        return self.load_docs("data/sciq/SciQ dataset-2 3/test.json")
    def doc_to_text(self, doc):
        return "{}\nQuestion: {}\nAnswer:".format(doc["source"], doc["query"]).strip()
\ No newline at end of file
--- a/lm_eval/tasks/squad.py
+++ b/lm_eval/tasks/squad.py
@@ -41,10 +41,6 @@ class SQuAD2(HFTask):
    def validation_docs(self):
        return self.data["validation"]
-    def fewshot_description(self):
-        # TODO: figure out description
-        return ""
    def doc_to_text(self, doc):
        return 'Title: ' + doc['title'] + '\n\n' + 'Background: ' + doc['context'] + '\n\n' + 'Question: ' + doc['question'] + '\n\n' + 'Answer:'

--- a/lm_eval/tasks/storycloze.py
+++ b/lm_eval/tasks/storycloze.py
@@ -27,18 +27,12 @@ class StoryCloze(Task):
            filereader = csv.reader(file)
            return list(filereader)
    def validation_docs(self):
        return self.load_doc("data/storycloze/cloze_test_val__winter2018-cloze_test_ALL_val - 1 - 1.csv")
    def test_docs(self):
        return self.load_doc("data/storycloze/cloze_test_test__winter2018-cloze_test_ALL_test - 1.csv")
-    def fewshot_description(self):
-        # TODO: figure out fewshot description
-        return ""
    def doc_to_text(self, doc):
        return ' '.join([*doc[1:5]])