Merge branch 'master' of github.com:EleutherAI/lm-evaluation-harness into asdiv

8dbd24f6 · Leo Gao · 4b3dee67 · a67c17e0 · 8dbd24f6 · 8dbd24f6
Commit 8dbd24f6 authored Jan 03, 2022 by Leo Gao
13 changed files
--- a/README.md
+++ b/README.md
@@ -21,7 +21,7 @@ pip install lm-eval

 ## Basic Usage

-To evaluate a model, (e.g. GPT-2) on NLU tasks (e.g. LAMBADA, HellaSwag), you can run the following command.
+To evaluate a model (e.g. GPT-2) on NLU tasks (e.g. LAMBADA, HellaSwag), you can run the following command. **When reporting results from the eval harness, please include the task versions (shown in `results["versions"]`) for reproducibility.** This allows tasks to receive bug fixes while ensuring that previously reported scores remain reproducible. See the [Task Versioning](https://github.com/EleutherAI/lm-evaluation-harness#task-versioning) section for more info.

 ```bash
 python main.py \
@@ -128,8 +128,9 @@ To implement a new task in eval harness, see [this guide](https://github.com/Ele
 |openbookqa                                               |✓    |✓  |✓   |          500|acc, acc_norm                                                                 |
 |squad2                                                   |✓    |✓  |    |        11873|exact, f1, HasAns_exact, HasAns_f1, NoAns_exact, NoAns_f1, best_exact, best_f1|
 |race                                                     |✓    |✓  |✓   |         1045|acc                                                                           |
-|headqa                                                   |✓    |✓  |✓   |         2742|acc, acc_norm                                                                 |
 |mathqa                                                   |✓    |✓  |✓   |         2985|acc, acc_norm                                                                 |
+|headqa_es                                                |✓    |✓  |✓   |         2742|acc, acc_norm                                                                 |
+|headqa_en                                                |✓    |✓  |✓   |         2742|acc, acc_norm                                                                 |
 |webqs                                                    |✓    |   |✓   |         2032|acc                                                                           |
 |wsc273                                                   |     |   |✓   |          273|acc                                                                           |
 |winogrande                                               |✓    |✓  |    |         1267|acc                                                                           |

--- a/lm_eval/tasks/__init__.py
+++ b/lm_eval/tasks/__init__.py
@@ -134,7 +134,9 @@ TASK_REGISTRY = {
    "squad2": squad.SQuAD2,
    "race": race.RACE,
    # "naturalqs": naturalqs.NaturalQs, # not implemented yet
-    "headqa": headqa.HeadQA,
+    "headqa": headqa.HeadQAEsDeprecated, # for backwards compat - headqa used to default to es
+    "headqa_es": headqa.HeadQAEs,
+    "headqa_en": headqa.HeadQAEn,
    "mathqa": mathqa.MathQA,
    "webqs": webqs.WebQs,
    "wsc273": wsc273.WinogradSchemaChallenge273,

--- a/lm_eval/tasks/headqa.py
+++ b/lm_eval/tasks/headqa.py
@@ -2,10 +2,9 @@ from . common import HFTask
 from lm_eval.base import MultipleChoiceTask


-class HeadQA(HFTask, MultipleChoiceTask):
+class HeadQABase(HFTask, MultipleChoiceTask):
    VERSION = 0
    DATASET_PATH = "head_qa"
-    DATASET_NAME = None

    def has_training_docs(self):
        return True
@@ -31,3 +30,23 @@ class HeadQA(HFTask, MultipleChoiceTask):

    def doc_to_text(self, doc):
        return doc["query"]
+
+class HeadQAEn(HeadQABase):
+    """HeadQA task variant with DATASET_NAME set to "en"."""
+    DATASET_NAME = "en"
+
+class HeadQAEs(HeadQABase):
+    """HeadQA task variant with DATASET_NAME set to "es"."""
+    DATASET_NAME = "es"
+
+# for backwards compatibility
+class HeadQAEsDeprecated(HeadQABase):
+    """Deprecated "es" variant kept so the old `headqa` task name still works."""
+    DATASET_NAME = "es"
+
+    def __init__(self, *args, **kwargs):
+        # Warn only when the deprecated task is actually instantiated. A print
+        # placed directly in the class body runs once at import time, so every
+        # user of the package would see it even without requesting `headqa`.
+        print("WARNING: headqa is deprecated. Please use headqa_es or headqa_en instead. See https://github.com/EleutherAI/lm-evaluation-harness/pull/240 for more info.")
+        super().__init__(*args, **kwargs)
\ No newline at end of file
--- a/lm_eval/tasks/pile.py
+++ b/lm_eval/tasks/pile.py
@@ -18,9 +18,12 @@ class PilePerplexityTask(PerplexityTask, abc.ABC):

    def download(self):
        # TODO: separate pile val/test out by component so we don't have to scan the entire file once per set
-        os.makedirs("data/pile/", exist_ok=True)
-        download_file("https://the-eye.eu/public/AI/pile/val.jsonl.zst", self.VAL_PATH, "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92")
-        download_file("https://the-eye.eu/public/AI/pile/test.jsonl.zst", self.TEST_PATH, "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e")
+
+        if not os.path.exists("data/pile/test.jsonl.zst"):
+            # TODO: use the new best_download fallback API
+            os.makedirs("data/pile/", exist_ok=True)
+            download_file("http://eaidata.bmk.sh/data/pile/val.jsonl.zst", self.VAL_PATH, "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92")
+            download_file("http://eaidata.bmk.sh/data/pile/test.jsonl.zst", self.TEST_PATH, "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e")

    def validation_docs(self):
        rdr = lm_dataformat.Reader(self.VAL_PATH)

--- a/lm_eval/tasks/superglue.py
+++ b/lm_eval/tasks/superglue.py
@@ -13,7 +13,7 @@ from ..utils import general_detokenize


 class BoolQ(HFTask):
-    VERSION = 0
+    VERSION = 1
    DATASET_PATH = "super_glue"
    DATASET_NAME = "boolq"

@@ -31,7 +31,7 @@ class BoolQ(HFTask):
        return "Read the following passages and answer each question with a yes or a no."

    def doc_to_text(self, doc):
-        return f"{doc['passage']}\nQuestion: {doc['question']}\nAnswer:"
+        return f"{doc['passage']}\nQuestion: {doc['question']}?\nAnswer:"
    
    def doc_to_target(self, doc):
        return " " + yesno(doc['label']) 

--- a/tests/test_cache.db
+++ b/tests/test_cache.db
--- a/tests/test_tasks.py
+++ b/tests/test_tasks.py
@@ -32,7 +32,7 @@ def test_basic_interface(taskname, task_class):

    limit = None

-    if taskname in ["triviaqa"]:
+    if taskname in ["triviaqa"] or taskname.startswith("pile_"):
        limit = 10000
    if task.has_validation_docs():
        arr = list(islice(task.validation_docs(), limit))

--- a/tests/testdata/boolq-v1-loglikelihood
+++ b/tests/testdata/boolq-v1-loglikelihood
+6577e0d88572772ef08e64f624c0e3df0953286ae1f118ccef15623b59ffeabf
\ No newline at end of file
--- a/tests/testdata/boolq-v1-res.json
+++ b/tests/testdata/boolq-v1-res.json
+{"results": {"boolq": {"acc": 0.5048929663608562, "acc_stderr": 0.00874463623355505}}, "versions": {"boolq": 1}}
\ No newline at end of file
--- a/tests/testdata/headqa_en-v0-loglikelihood
+++ b/tests/testdata/headqa_en-v0-loglikelihood
+09da45119b12a0144e3081f8fb790c2a22af7b9c3aac42f54423d348a711fbf5
\ No newline at end of file
--- a/tests/testdata/headqa_en-v0-res.json
+++ b/tests/testdata/headqa_en-v0-res.json
+{"results": {"headqa_en": {"acc": 0.23559445660102116, "acc_norm": 0.2447118891320204, "acc_norm_stderr": 0.008211629406841468, "acc_stderr": 0.008105688874297972}}, "versions": {"headqa_en": 0}}
\ No newline at end of file
--- a/tests/testdata/headqa_es-v0-loglikelihood
+++ b/tests/testdata/headqa_es-v0-loglikelihood
+767ca34d9714edd9fb030ddbcc35a64e5180d1e247b0cb557fbb22fdf971ad1f
\ No newline at end of file
--- a/tests/testdata/headqa_es-v0-res.json
+++ b/tests/testdata/headqa_es-v0-res.json
+{"results": {"headqa_es": {"acc": 0.23559445660102116, "acc_norm": 0.25018234865062, "acc_norm_stderr": 0.008272783230806014, "acc_stderr": 0.008105688874297972}}, "versions": {"headqa_es": 0}}
\ No newline at end of file