Unverified Commit 8a42e342 authored by Lintang Sutawika, committed by GitHub

Merge branch 'big-refactor' into big-refactor

parents 346be595 b6c70fc8
include: pile_arxiv.yaml
task: pile_wikipedia
dataset_name: pile_wikipedia
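The new `pile_wikipedia` config reuses a shared base config through `include`, overriding only `task` and `dataset_name`. Below is a minimal sketch of how such an include could be resolved; the merge semantics, path handling, and function name are assumptions for illustration, not the harness's actual loader.

```python
import yaml


def load_task_config(path):
    """Hypothetical resolver: read a task YAML and, if it declares
    `include`, merge its keys over the included base config."""
    with open(path) as f:
        config = yaml.safe_load(f)
    include = config.pop("include", None)
    if include:
        base = load_task_config(include)  # resolve nested includes too
        base.update(config)               # the child's keys win
        config = base
    return config


# e.g. load_task_config("pile_wikipedia.yaml") would start from pile_arxiv.yaml
# and override `task` and `dataset_name`; the real loader may resolve the
# include path relative to the config's own directory.
```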
@@ -28,9 +28,16 @@ Homepage: https://www.salesforce.com/products/einstein/ai-research/the-wikitext-
### Subtasks
* `wikitext`: measure perplexity on the Wikitext dataset, via rolling loglikelihoods.
### Checklist
-- [x] Is in Eval-harness v1.0 ?
-- [x] Has been checked for regression from v1.0?
-- [ ] Has been checked for equivalence with original paper methodology?
-- [ ] "Main" checked variant clearly denoted?
+* [x] Is the task an existing benchmark in the literature?
+* [x] Have you referenced the original paper that introduced the task?
+* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
+If other tasks on this dataset are already supported:
+* [x] Is the "Main" variant of this task clearly denoted?
+* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
+* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
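For the `wikitext` subtask above, perplexity is computed from rolling loglikelihoods over each document. Here is a hedged sketch of running it programmatically through the evaluator imported in the script below; `simple_evaluate`, the `hf` model name, and the reported metric names are assumptions about the harness's public API rather than anything shown in this diff.

```python
from lm_eval import evaluator

# Assumed entry point: most releases expose evaluator.simple_evaluate;
# argument names may differ on the big-refactor branch.
results = evaluator.simple_evaluate(
    model="hf",                    # backend name is an assumption
    model_args="pretrained=gpt2",  # same format as the --model_args flag
    tasks=["wikitext"],
)
# Typically reports word_perplexity, byte_perplexity, and bits_per_byte.
print(results["results"]["wikitext"])
```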
@@ -5,12 +5,11 @@ import argparse
import logging
from lm_eval import evaluator, utils
from lm_eval.api.registry import GROUP_REGISTRY, TASK_REGISTRY
+from lm_eval.api.registry import ALL_TASKS
from lm_eval.logger import eval_logger
os.environ["TOKENIZERS_PARALLELISM"] = "false"
-ALL_TASKS = sorted(list(TASK_REGISTRY.keys()) + list(GROUP_REGISTRY.keys()))
-print("ALL tasks: ", ALL_TASKS)
class MultiChoice:
def __init__(self, choices):
@@ -22,9 +21,8 @@ class MultiChoice:
if len(fnmatch.filter(self.choices, value)) == 0:
eval_logger.warning("{} is not in task list.".format(value))
eval_logger.info(f"Available tasks to choose:")
-# for choice in self.choices:
-# eval_logger.info(f"  {choice}")
-eval_logger.info(ALL_TASKS)
+for choice in self.choices:
+eval_logger.info(f"  - {choice}")
return True
def __iter__(self):
@@ -36,7 +34,7 @@ def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--model", required=True)
parser.add_argument("--model_args", default="")
parser.add_argument("--tasks", default=None, choices=MultiChoice(ALL_TASKS))
parser.add_argument("--tasks", default=None, choices=MultiChoice(sorted(ALL_TASKS)))
parser.add_argument("--config", default=None)
parser.add_argument("--provide_description", action="store_true")
parser.add_argument("--num_fewshot", type=int, default=0)
......
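`MultiChoice` lets `--tasks` accept glob patterns: argparse tests membership with `in`, which calls `__contains__`, and `fnmatch.filter` matches each comma-separated value against the registered task names as a wildcard. Below is a self-contained sketch of that behaviour with made-up task names; unlike the class above, this simplified mirror returns `False` on a miss instead of only logging a warning.

```python
import fnmatch


class MultiChoice:
    """Simplified mirror of the class above: containment succeeds when
    the value, treated as a glob pattern, matches at least one choice."""

    def __init__(self, choices):
        self.choices = choices

    def __contains__(self, value):
        return len(fnmatch.filter(self.choices, value)) > 0

    def __iter__(self):
        yield from self.choices


tasks = MultiChoice(["wikitext", "pile_arxiv", "pile_wikipedia"])
print("pile_*" in tasks)   # True  -- the glob matches two pile tasks
print("lambada" in tasks)  # False -- no registered task matches
```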
@@ -32,10 +32,10 @@ def main():
task_names = args.tasks.split(",")
task_dict = tasks.get_task_dict(task_names)
-description_dict = {}
-if args.description_dict_path:
-with open(args.description_dict_path, "r") as f:
-description_dict = json.load(f)
+# description_dict = {}
+# if args.description_dict_path:
+#     with open(args.description_dict_path, "r") as f:
+#         description_dict = json.load(f)
os.makedirs(args.output_base_path, exist_ok=True)
for task_name, task in task_dict.items():
@@ -55,11 +55,11 @@ def main():
docs = join_iters(iters)
-description = (
-description_dict[task_name]
-if description_dict and task_name in description_dict
-else ""
-)
+# description = (
+# description_dict[task_name]
+# if description_dict and task_name in description_dict
+# else ""
+# )
with open(os.path.join(args.output_base_path, task_name), "w") as f:
for i, doc in (
......
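The code commented out above expected `--description_dict_path` to point at a JSON file keyed by task name, and looked up an optional description string for each task when writing out its documents. A short sketch of that file format and lookup, grounded in the removed lines; the file name here is illustrative.

```python
import json

# descriptions.json (illustrative contents):
# {
#   "wikitext": "",
#   "pile_wikipedia": "The following are Wikipedia articles."
# }

with open("descriptions.json") as f:
    description_dict = json.load(f)

task_name = "pile_wikipedia"
description = (
    description_dict[task_name]
    if description_dict and task_name in description_dict
    else ""
)
print(repr(description))
```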
# Task-name
### Paper
Title: `paper title goes here`
Abstract: `link to paper PDF or arXiv abstract goes here`
`Short description of paper / benchmark goes here:`
Homepage: `homepage to the benchmark's website goes here, if applicable`
### Citation
```
BibTeX-formatted citation goes here
```
### Subtasks
List or describe tasks defined in this folder, and their names here:
* `task_name`: `1-sentence description of what this particular task does`
* `task_name2`: .....
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?