Commit 8dbd24f6 authored by Leo Gao's avatar Leo Gao
Browse files

Merge branch 'master' of github.com:EleutherAI/lm-evaluation-harness into asdiv

parents 4b3dee67 a67c17e0
......@@ -21,7 +21,7 @@ pip install lm-eval
## Basic Usage
To evaluate a model, (e.g. GPT-2) on NLU tasks (e.g. LAMBADA, HellaSwag), you can run the following command.
To evaluate a model, (e.g. GPT-2) on NLU tasks (e.g. LAMBADA, HellaSwag), you can run the following command. **When reporting results from eval harness, please include the task versions (shown in `results["versions"]`) for reproducibility.** This allows bug fixes to tasks while also ensuring that previously reported scores are reproducible. See the [Task Versioning](https://github.com/EleutherAI/lm-evaluation-harness#task-versioning) section for more info.
```bash
python main.py \
......@@ -128,8 +128,9 @@ To implement a new task in eval harness, see [this guide](https://github.com/Ele
|openbookqa |✓ |✓ |✓ | 500|acc, acc_norm |
|squad2 |✓ |✓ | | 11873|exact, f1, HasAns_exact, HasAns_f1, NoAns_exact, NoAns_f1, best_exact, best_f1|
|race |✓ |✓ |✓ | 1045|acc |
|headqa |✓ |✓ |✓ | 2742|acc, acc_norm |
|mathqa |✓ |✓ |✓ | 2985|acc, acc_norm |
|headqa_es |✓ |✓ |✓ | 2742|acc, acc_norm |
|headqa_en |✓ |✓ |✓ | 2742|acc, acc_norm |
|webqs |✓ | |✓ | 2032|acc |
|wsc273 | | |✓ | 273|acc |
|winogrande |✓ |✓ | | 1267|acc |
......
......@@ -134,7 +134,9 @@ TASK_REGISTRY = {
"squad2": squad.SQuAD2,
"race": race.RACE,
# "naturalqs": naturalqs.NaturalQs, # not implemented yet
"headqa": headqa.HeadQA,
"headqa": headqa.HeadQAEsDeprecated, # for backwards compat - headqa used to default to es
"headqa_es": headqa.HeadQAEs,
"headqa_en": headqa.HeadQAEn,
"mathqa": mathqa.MathQA,
"webqs": webqs.WebQs,
"wsc273": wsc273.WinogradSchemaChallenge273,
......
......@@ -2,10 +2,9 @@ from . common import HFTask
from lm_eval.base import MultipleChoiceTask
class HeadQA(HFTask, MultipleChoiceTask):
class HeadQABase(HFTask, MultipleChoiceTask):
VERSION = 0
DATASET_PATH = "head_qa"
DATASET_NAME = None
def has_training_docs(self):
return True
......@@ -31,3 +30,15 @@ class HeadQA(HFTask, MultipleChoiceTask):
def doc_to_text(self, doc):
return doc["query"]
class HeadQAEn(HeadQABase):
DATASET_NAME = "en"
class HeadQAEs(HeadQABase):
DATASET_NAME = "es"
# for backwards compatibility
class HeadQAEsDeprecated(HeadQABase):
DATASET_NAME = "es"
print("WARNING: headqa is deprecated. Please use headqa_es or headqa_en instead. See https://github.com/EleutherAI/lm-evaluation-harness/pull/240 for more info.")
\ No newline at end of file
......@@ -18,9 +18,12 @@ class PilePerplexityTask(PerplexityTask, abc.ABC):
def download(self):
# TODO: separate pile val/test out by component so we don't have to scan the entire file once per set
os.makedirs("data/pile/", exist_ok=True)
download_file("https://the-eye.eu/public/AI/pile/val.jsonl.zst", self.VAL_PATH, "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92")
download_file("https://the-eye.eu/public/AI/pile/test.jsonl.zst", self.TEST_PATH, "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e")
if not os.path.exists("data/pile/test.jsonl.zst"):
# todo use new best_download fallback api
os.makedirs("data/pile/", exist_ok=True)
download_file("http://eaidata.bmk.sh/data/pile/val.jsonl.zst", self.VAL_PATH, "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92")
download_file("http://eaidata.bmk.sh/data/pile/test.jsonl.zst", self.TEST_PATH, "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e")
def validation_docs(self):
rdr = lm_dataformat.Reader(self.VAL_PATH)
......
......@@ -13,7 +13,7 @@ from ..utils import general_detokenize
class BoolQ(HFTask):
VERSION = 0
VERSION = 1
DATASET_PATH = "super_glue"
DATASET_NAME = "boolq"
......@@ -31,7 +31,7 @@ class BoolQ(HFTask):
return "Read the following passages and answer each question with a yes or a no."
def doc_to_text(self, doc):
return f"{doc['passage']}\nQuestion: {doc['question']}\nAnswer:"
return f"{doc['passage']}\nQuestion: {doc['question']}?\nAnswer:"
def doc_to_target(self, doc):
return " " + yesno(doc['label'])
......
......@@ -32,7 +32,7 @@ def test_basic_interface(taskname, task_class):
limit = None
if taskname in ["triviaqa"]:
if taskname in ["triviaqa"] or taskname.startswith("pile_"):
limit = 10000
if task.has_validation_docs():
arr = list(islice(task.validation_docs(), limit))
......
6577e0d88572772ef08e64f624c0e3df0953286ae1f118ccef15623b59ffeabf
\ No newline at end of file
{"results": {"boolq": {"acc": 0.5048929663608562, "acc_stderr": 0.00874463623355505}}, "versions": {"boolq": 1}}
\ No newline at end of file
09da45119b12a0144e3081f8fb790c2a22af7b9c3aac42f54423d348a711fbf5
\ No newline at end of file
{"results": {"headqa_en": {"acc": 0.23559445660102116, "acc_norm": 0.2447118891320204, "acc_norm_stderr": 0.008211629406841468, "acc_stderr": 0.008105688874297972}}, "versions": {"headqa_en": 0}}
\ No newline at end of file
767ca34d9714edd9fb030ddbcc35a64e5180d1e247b0cb557fbb22fdf971ad1f
\ No newline at end of file
{"results": {"headqa_es": {"acc": 0.23559445660102116, "acc_norm": 0.25018234865062, "acc_norm_stderr": 0.008272783230806014, "acc_stderr": 0.008105688874297972}}, "versions": {"headqa_es": 0}}
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment