"vscode:/vscode.git/clone" did not exist on "e748d7853daa5a0dcb9afc0a7581af148ab2fc3c"
Unverified Commit 5627e819 authored by Hailey Schoelkopf, committed by GitHub

Cleanup for v0.4.2 release (#1573)

* Update interface.md

* fix: make caching reqs always work with accelerate launch

* remove stale task migration checklist

* remove deprecation warnings

* make informative TypeErrors for get_task_dict

* bump version metadata

* fix num_fewshot printing bug

* add fewshot value to cache key
parent 6fae67a6
In `interface.md`, the external-library usage example is corrected:

````diff
@@ -112,8 +112,8 @@ my_model = initialize_my_model()
 # - `Your_LM.generate_until()`
 lm_obj = Your_LM(model=my_model, batch_size=16)
 
-# The task_manager indexes tasks including ones
-# specified by the user through `include_path`
+# optional: the task_manager indexes tasks including ones
+# specified by the user through `include_path`.
 task_manager = lm_eval.tasks.TaskManager(
     include_path="/path/to/custom/yaml"
 )
@@ -138,9 +138,9 @@ task_dict = lm_eval.tasks.get_task_dict(
     # custom paths is required.
 )
 
-def evaluate(
+results = evaluate(
     lm=lm_obj,
     task_dict=task_dict,
     ...
-):
+)
 ```
````
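Assembled, the corrected docs snippet reads roughly as below. This is a sketch pieced together from the two hunks above; `Your_LM`, `initialize_my_model`, and the YAML path are the documentation's placeholders, and `"hellaswag"` stands in for whatever tasks the caller wants.

```python
import lm_eval
from lm_eval.evaluator import evaluate

my_model = initialize_my_model()  # placeholder: the user's own model setup

# `Your_LM` is a user-defined LM subclass implementing loglikelihood(),
# loglikelihood_rolling(), and generate_until()
lm_obj = Your_LM(model=my_model, batch_size=16)

# optional: the task_manager indexes tasks, including ones
# specified by the user through `include_path`
task_manager = lm_eval.tasks.TaskManager(
    include_path="/path/to/custom/yaml"
)

task_dict = lm_eval.tasks.get_task_dict(
    ["hellaswag"],  # entries may be task-name strings, config dicts, or Task objects
    task_manager,
)

# the docs previously showed a `def evaluate(...)` signature here;
# the fixed example calls evaluate() and keeps its results dict
results = evaluate(
    lm=lm_obj,
    task_dict=task_dict,
)
```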
In the CLI entrypoint module, the deprecated imports are dropped:

```diff
@@ -13,7 +13,7 @@ import numpy as np
 from lm_eval import evaluator, utils
 from lm_eval.evaluator import request_caching_arg_to_dict
 from lm_eval.logging_utils import WandbLogger
-from lm_eval.tasks import TaskManager, include_path, initialize_tasks
+from lm_eval.tasks import TaskManager
 from lm_eval.utils import make_table, simple_parse_args_string
```
In `cli_evaluate`, the `initialize_tasks()` call is replaced by the `TaskManager`-based setup, and the old `include_path()` handling is removed:

```diff
@@ -258,7 +258,8 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
             "Specify --output_path if providing --log_samples or --predict_only"
         )
 
-    initialize_tasks(args.verbosity)
+    if args.include_path is not None:
+        eval_logger.info(f"Including path: {args.include_path}")
     task_manager = TaskManager(args.verbosity, include_path=args.include_path)
 
     if args.limit:
@@ -266,9 +267,6 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
             " --limit SHOULD ONLY BE USED FOR TESTING."
             "REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
        )
-    if args.include_path is not None:
-        eval_logger.info(f"Including path: {args.include_path}")
-        include_path(args.include_path)
 
     if args.tasks is None:
         eval_logger.error("Need to specify task to evaluate.")
```
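For external scripts migrating off the removed helpers, the change amounts to collapsing the old two-step setup into one constructor call. A minimal sketch (the include path is illustrative):

```python
from lm_eval.tasks import TaskManager

# before v0.4.2 (both helpers are deleted further down in this commit):
#   initialize_tasks("INFO")
#   include_path("/path/to/custom/yaml")

# after: a single TaskManager indexes built-in tasks and, optionally,
# any user-supplied YAML task configs under include_path
task_manager = TaskManager("INFO", include_path="/path/to/custom/yaml")
```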
In the `Task` base class, the request-cache key now encodes the few-shot count and the distributed rank/world size:

```diff
@@ -376,7 +376,7 @@ class Task(abc.ABC):
         # used with caching
         og_limit = limit
 
-        cache_key = f"requests-{self._config.task}"
+        cache_key = f"requests-{self._config.task}-{self.config.num_fewshot}shot-rank{rank}-world_size{world_size}"
 
         cached_instances = load_from_cache(file_name=cache_key)
```
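Under the old key, runs of the same task with different few-shot counts, or different ranks of a distributed run, collided on the same cache file and could silently reuse each other's cached requests. A small illustration of the keying change (a hypothetical standalone helper mirroring the diff's f-string, not the harness's API):

```python
def cache_key(task: str, num_fewshot: int, rank: int, world_size: int) -> str:
    # mirrors the f-string introduced in the diff above
    return f"requests-{task}-{num_fewshot}shot-rank{rank}-world_size{world_size}"

# previously both of these runs mapped to the single key "requests-hellaswag"
assert cache_key("hellaswag", 0, 0, 1) != cache_key("hellaswag", 5, 0, 1)
# shards of a 2-process run no longer clobber one another either
assert cache_key("hellaswag", 5, 0, 2) != cache_key("hellaswag", 5, 1, 2)
```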
In `simple_evaluate`, a stale deprecation notice is removed, and the few-shot override logic gains explanatory comments plus a fallback for tasks with no configured `num_fewshot`:

```diff
@@ -189,10 +189,6 @@ def simple_evaluate(
     if task_manager is None:
         task_manager = TaskManager(verbosity)
 
-    eval_logger.info(
-        "get_task_dict has been updated to accept an optional argument, `task_manager`"
-        "Read more here:https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md#external-library-usage"
-    )
     task_dict = get_task_dict(tasks, task_manager)
     for task_name in task_dict.keys():
         task_obj = task_dict[task_name]
@@ -215,6 +211,8 @@ def simple_evaluate(
             # we have to change the class properties post-hoc. This is pretty hacky.
             task_obj.override_metric(metric_name="bypass")
 
+        # override tasks' fewshot values to the provided num_fewshot arg value
+        # except if tasks have it set to 0 manually in their configs--then we should never overwrite that
         if num_fewshot is not None:
             if (default_num_fewshot := task_obj.get_config("num_fewshot")) == 0:
                 eval_logger.info(
@@ -225,6 +223,10 @@ def simple_evaluate(
                     f"Overwriting default num_fewshot of {task_name} from {default_num_fewshot} to {num_fewshot}"
                 )
                 task_obj.set_config(key="num_fewshot", value=num_fewshot)
+        else:
+            # if num_fewshot not provided, and the task does not define a default one, default to 0
+            if (default_num_fewshot := task_obj.get_config("num_fewshot")) is None:
+                task_obj.set_config(key="num_fewshot", value=0)
 
     if check_integrity:
         run_task_tests(task_list=tasks)
```
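The resulting precedence rules are easiest to see in isolation. A sketch (a hypothetical helper summarizing the branches above, not the harness's API; `task_default` stands for the task config's `num_fewshot`):

```python
from typing import Optional

def resolve_num_fewshot(cli_value: Optional[int], task_default: Optional[int]) -> int:
    """Summarize the few-shot precedence applied in simple_evaluate."""
    if cli_value is not None:
        if task_default == 0:
            # a task that pins num_fewshot to 0 in its config is never overridden
            return 0
        return cli_value
    # no CLI override: keep the task's default, else fall back to 0
    return task_default if task_default is not None else 0

assert resolve_num_fewshot(5, None) == 5     # CLI value applies
assert resolve_num_fewshot(5, 0) == 0        # config-pinned 0-shot wins
assert resolve_num_fewshot(None, None) == 0  # new fallback introduced above
```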
The stale task migration checklist removed by this commit read as follows:

# v1.0 Tasks
This list keeps track of which tasks' implementations have been ported to YAML / v2.0 of the Eval Harness.
Boxes should be checked iff a task is implemented in the refactor and tested for regression. Tasks should be struck through if they were checked against the implementation from the *original introducing paper* or the popularizing implementation. (WIP) denotes that a PR or person is already working on the task.
- [x] Glue
- [x] SuperGlue
- [x] CoQA
- [x] DROP
- [x] ~~Lambada~~
- [x] Lambada (Cloze variants)
- [x] ~~Lambada (Multilingual)~~
- [x] Wikitext
- [x] PiQA
- [x] PROST
- [x] MCTACO
- [x] Pubmed QA
- [x] SciQ
- [x] QASPER
- [x] QA4MRE
- [x] TriviaQA
- [x] AI2 ARC
- [x] LogiQA
- [x] HellaSwag
- [x] SWAG
- [x] OpenBookQA
- [ ] SQuADv2 (Lintang)
- [x] RACE
- [x] HeadQA
- [x] MathQA
- [x] WebQs
- [x] WSC273
- [x] Winogrande
- [x] ANLI
- [x] Hendrycks Ethics (missing some tasks/metrics, see PR 660: <https://github.com/EleutherAI/lm-evaluation-harness/pull/660> for more info)
- [x] TruthfulQA (mc1)
- [x] TruthfulQA (mc2)
- [x] TruthfulQA (gen)
- [x] MuTual
- [ ] Hendrycks Math (Hailey)
- [x] Asdiv
- [ ] GSM8k
- [x] Arithmetic
- [ ] MMMLU (Hailey)
- [x] Translation (WMT) suite
- [x] Unscramble
- [x] ~~Pile (perplexity)~~
- [x] BLiMP
- [x] ToxiGen
- [x] StoryCloze
- [ ] NaturalQs (Hailey)
- [x] CrowS-Pairs
- [x] XCopa
- [ ] BIG-Bench (Hailey)
- [x] XStoryCloze
- [x] XWinograd
- [x] PAWS-X
- [x] XNLI
- [x] MGSM
- [ ] SCROLLS
- [x] Babi
- [x] Belebele
# Novel Tasks
Tasks added in the revamped harness that were not previously available. Again, a strikethrough denotes checking performed *against the original task's implementation or published results introducing the task*.
# Task Wishlist
- [ ] TheoremQA
- [ ] Theorem Proving evaluations
- [ ] Chain of Thought
- [ ] Self-consistency; Least-to-Most prompting, etc.
- [ ] Summarization Tasks
- [ ] Anthropic Model-Written Evals
In the tasks module, the deprecated `include_path()` and `initialize_tasks()` shims are deleted, and `get_task_dict` now both accepts a bare string and validates its input with informative `TypeError`s:

```diff
@@ -356,28 +356,6 @@ class TaskManager:
         return tasks_and_groups
 
-def include_path(task_dir):
-    logger = utils.eval_logger
-    logger.setLevel(getattr(logging, "INFO"))
-    logger.info(
-        "To still use tasks loaded from args.include_path,"
-        "see an example of the new TaskManager API in "
-        "https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md#external-library-usage"
-    )
-    return 0
-
-def initialize_tasks(verbosity="INFO"):
-    logger = utils.eval_logger
-    logger.setLevel(getattr(logging, f"{verbosity}"))
-    logger.info(
-        "lm_eval.tasks.initialize_tasks() is deprecated and no longer necessary. "
-        "It will be removed in v0.4.2 release. "
-        "TaskManager will instead be used."
-    )
-    return 0
-
 def get_task_name_from_config(task_config: Dict[str, str]) -> str:
     if "task" in task_config:
         return task_config["task"]
@@ -401,7 +379,7 @@ def get_task_name_from_object(task_object):
 def get_task_dict(
-    task_name_list: List[Union[str, Dict, Task]],
+    task_name_list: Union[str, List[Union[str, Dict, Task]]],
     task_manager: Optional[TaskManager] = None,
 ):
     """Creates a dictionary of task objects from either a name of task, config, or prepared Task object.
@@ -423,6 +401,15 @@ def get_task_dict(
     if isinstance(task_name_list, str):
         task_name_list = [task_name_list]
+    elif isinstance(task_name_list, list):
+        if not all([isinstance(task, (str, dict, Task)) for task in task_name_list]):
+            raise TypeError(
+                "Expected all list items to be of types 'str', 'dict', or 'Task', but at least one entry did not match."
+            )
+    else:
+        raise TypeError(
+            f"Expected a 'str' or 'list' but received {type(task_name_list)}."
+        )
 
     string_task_name_list = [task for task in task_name_list if isinstance(task, str)]
     others_task_name_list = [task for task in task_name_list if ~isinstance(task, str)]
```
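From a caller's perspective, the new checks turn previously confusing downstream failures into immediate, descriptive errors. A sketch of the behavior (assumes the harness and its built-in `hellaswag` task are available):

```python
from lm_eval.tasks import get_task_dict

# a bare string is now accepted and promoted to a one-element list
task_dict = get_task_dict("hellaswag")

# a list containing a non-str/dict/Task entry fails fast
try:
    get_task_dict(["hellaswag", 42])
except TypeError as err:
    print(err)  # "Expected all list items to be of types 'str', 'dict', or 'Task', ..."

# any other top-level type is rejected outright
try:
    get_task_dict(("hellaswag",))
except TypeError as err:
    print(err)  # "Expected a 'str' or 'list' but received <class 'tuple'>."
```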
In `pyproject.toml`, the version metadata is bumped:

```diff
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "lm_eval"
-version = "0.4.1"
+version = "0.4.2"
 authors = [
     {name="EleutherAI", email="contact@eleuther.ai"}
 ]
```
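Once this revision is installed, the bump is visible through standard package metadata; a quick check (assumes the package is installed from this commit):

```python
from importlib.metadata import version

# reads the version declared in pyproject.toml's [project] table
print(version("lm_eval"))  # expected: 0.4.2
```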