Unverified Commit 5627e819 authored by Hailey Schoelkopf, committed by GitHub

Cleanup for v0.4.2 release (#1573)

* Update interface.md

* fix: make caching reqs always work with accelerate launch

* remove stale task migration checklist

* remove deprecation warnings

* make informative TypeErrors for get_task_dict

* bump version metadata

* fix num_fewshot printing bug

* add fewshot value to cache key
parent 6fae67a6
@@ -112,8 +112,8 @@ my_model = initialize_my_model()
 # - `Your_LM.generate_until()`
 lm_obj = Your_LM(model=my_model, batch_size=16)
 
-# The task_manager indexes tasks including ones
-# specified by the user through `include_path`
+# optional: the task_manager indexes tasks including ones
+# specified by the user through `include_path`.
 task_manager = lm_eval.tasks.TaskManager(
     include_path="/path/to/custom/yaml"
 )
@@ -138,9 +138,9 @@ task_dict = lm_eval.tasks.get_task_dict(
     # custom paths is required.
 )
 
-def evaluate(
+results = evaluate(
     lm=lm_obj,
     task_dict=task_dict,
     ...
-):
+)
 ```
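Pieced together, the corrected interface.md snippet corresponds to an end-to-end flow like the sketch below. This is a minimal illustration rather than part of the diff: it assumes the bundled HuggingFace backend (`HFLM`), the `hellaswag` task, and a small `limit` purely to keep the example short.

```python
from lm_eval.evaluator import evaluate
from lm_eval.models.huggingface import HFLM
from lm_eval.tasks import TaskManager, get_task_dict

# any lm_eval LM subclass works here; HFLM is just a concrete stand-in
lm_obj = HFLM(pretrained="EleutherAI/pythia-70m", batch_size=16)

# optional: the task_manager indexes tasks, including ones the user
# registers via include_path="/path/to/custom/yaml"
task_manager = TaskManager()

task_dict = get_task_dict(["hellaswag"], task_manager)
results = evaluate(lm=lm_obj, task_dict=task_dict, limit=10)

print(results["results"]["hellaswag"])  # summary metrics for the task
```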
@@ -13,7 +13,7 @@ import numpy as np
 from lm_eval import evaluator, utils
 from lm_eval.evaluator import request_caching_arg_to_dict
 from lm_eval.logging_utils import WandbLogger
-from lm_eval.tasks import TaskManager, include_path, initialize_tasks
+from lm_eval.tasks import TaskManager
 from lm_eval.utils import make_table, simple_parse_args_string
@@ -258,7 +258,8 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
             "Specify --output_path if providing --log_samples or --predict_only"
         )
 
-    initialize_tasks(args.verbosity)
+    if args.include_path is not None:
+        eval_logger.info(f"Including path: {args.include_path}")
     task_manager = TaskManager(args.verbosity, include_path=args.include_path)
 
     if args.limit:
@@ -266,9 +267,6 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
             " --limit SHOULD ONLY BE USED FOR TESTING."
             "REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
         )
-    if args.include_path is not None:
-        eval_logger.info(f"Including path: {args.include_path}")
-        include_path(args.include_path)
 
     if args.tasks is None:
         eval_logger.error("Need to specify task to evaluate.")
@@ -376,7 +376,7 @@ class Task(abc.ABC):
         # used with caching
         og_limit = limit
 
-        cache_key = f"requests-{self._config.task}"
+        cache_key = f"requests-{self._config.task}-{self.config.num_fewshot}shot-rank{rank}-world_size{world_size}"
 
         cached_instances = load_from_cache(file_name=cache_key)
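The new cache key folds in the few-shot count and the distributed rank/world size, so request caches written by different `accelerate launch` processes, or by runs with different few-shot settings, no longer collide. The snippet below is purely illustrative; the values are made up, while in the harness they come from the task config and the launcher.

```python
# illustrative reconstruction of the key format introduced above
task_name, num_fewshot, rank, world_size = "hellaswag", 5, 0, 2

old_key = f"requests-{task_name}"
new_key = f"requests-{task_name}-{num_fewshot}shot-rank{rank}-world_size{world_size}"

print(old_key)  # requests-hellaswag  (same file for every rank and fewshot setting)
print(new_key)  # requests-hellaswag-5shot-rank0-world_size2
```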
@@ -189,10 +189,6 @@ def simple_evaluate(
     if task_manager is None:
         task_manager = TaskManager(verbosity)
-    eval_logger.info(
-        "get_task_dict has been updated to accept an optional argument, `task_manager`"
-        "Read more here:https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md#external-library-usage"
-    )
     task_dict = get_task_dict(tasks, task_manager)
 
     for task_name in task_dict.keys():
         task_obj = task_dict[task_name]
@@ -215,6 +211,8 @@ def simple_evaluate(
             # we have to change the class properties post-hoc. This is pretty hacky.
             task_obj.override_metric(metric_name="bypass")
 
+        # override tasks' fewshot values to the provided num_fewshot arg value
+        # except if tasks have it set to 0 manually in their configs--then we should never overwrite that
         if num_fewshot is not None:
             if (default_num_fewshot := task_obj.get_config("num_fewshot")) == 0:
                 eval_logger.info(
@@ -225,6 +223,10 @@
                     f"Overwriting default num_fewshot of {task_name} from {default_num_fewshot} to {num_fewshot}"
                 )
                 task_obj.set_config(key="num_fewshot", value=num_fewshot)
+        else:
+            # if num_fewshot not provided, and the task does not define a default one, default to 0
+            if (default_num_fewshot := task_obj.get_config("num_fewshot")) is None:
+                task_obj.set_config(key="num_fewshot", value=0)
 
     if check_integrity:
         run_task_tests(task_list=tasks)
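After these two hunks, the few-shot resolution in `simple_evaluate` behaves roughly like the standalone sketch below. It is a simplification: `task_obj` stands in for a harness `Task` exposing `get_config`/`set_config`, and the logging calls are reduced to comments.

```python
def resolve_num_fewshot(task_obj, num_fewshot):
    """Sketch of the post-change precedence rules for few-shot counts."""
    default = task_obj.get_config("num_fewshot")
    if num_fewshot is not None:
        if default == 0:
            # a task pinned to 0-shot in its own config is never overridden
            pass
        else:
            # the caller-provided value wins over the task's default
            task_obj.set_config(key="num_fewshot", value=num_fewshot)
    elif default is None:
        # no caller value and no task default: fall back to 0-shot
        task_obj.set_config(key="num_fewshot", value=0)
```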
# v1.0 Tasks
This list keeps track of which tasks' implementations have been ported to YAML / v2.0 of the Eval Harness.
Boxes should be checked iff tasks are implemented in the refactor and tested for regression. Tasks should be struck through if they have been checked *against the original introducing paper's implementation* or a popularizing implementation. (WIP) denotes that a PR or person is already working on the task.
- [x] Glue
- [x] SuperGlue
- [x] CoQA
- [x] DROP
- [x] ~~Lambada~~
- [x] Lambada (Cloze variants)
- [x] ~~Lambada (Multilingual)~~
- [x] Wikitext
- [x] PiQA
- [x] PROST
- [x] MCTACO
- [x] Pubmed QA
- [x] SciQ
- [x] QASPER
- [x] QA4MRE
- [x] TriviaQA
- [x] AI2 ARC
- [x] LogiQA
- [x] HellaSwag
- [x] SWAG
- [x] OpenBookQA
- [ ] SQuADv2 (Lintang)
- [x] RACE
- [x] HeadQA
- [x] MathQA
- [x] WebQs
- [x] WSC273
- [x] Winogrande
- [x] ANLI
- [x] Hendrycks Ethics (missing some tasks/metrics, see PR 660: <https://github.com/EleutherAI/lm-evaluation-harness/pull/660> for more info)
- [x] TruthfulQA (mc1)
- [x] TruthfulQA (mc2)
- [x] TruthfulQA (gen)
- [x] MuTual
- [ ] Hendrycks Math (Hailey)
- [x] Asdiv
- [ ] GSM8k
- [x] Arithmetic
- [ ] MMMLU (Hailey)
- [x] Translation (WMT) suite
- [x] Unscramble
- [x] ~~Pile (perplexity)~~
- [x] BLiMP
- [x] ToxiGen
- [x] StoryCloze
- [ ] NaturalQs (Hailey)
- [x] CrowS-Pairs
- [x] XCopa
- [ ] BIG-Bench (Hailey)
- [x] XStoryCloze
- [x] XWinograd
- [x] PAWS-X
- [x] XNLI
- [x] MGSM
- [ ] SCROLLS
- [x] Babi
- [x] Belebele
# Novel Tasks
Tasks added in the revamped harness that were not previously available. Again, a strikethrough denotes checking performed *against the original task's implementation or published results introducing the task*.
# Task Wishlist
- [ ] TheoremQA
- [ ] Theorem Proving evaluations
- [ ] Chain of Thought
- [ ] Self-consistency; Least-to-Most prompting, etc.
- [ ] Summarization Tasks
- [ ] Anthropic Model-Written Evals
@@ -356,28 +356,6 @@ class TaskManager:
         return tasks_and_groups
 
-def include_path(task_dir):
-    logger = utils.eval_logger
-    logger.setLevel(getattr(logging, "INFO"))
-    logger.info(
-        "To still use tasks loaded from args.include_path,"
-        "see an example of the new TaskManager API in "
-        "https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md#external-library-usage"
-    )
-    return 0
-
-
-def initialize_tasks(verbosity="INFO"):
-    logger = utils.eval_logger
-    logger.setLevel(getattr(logging, f"{verbosity}"))
-    logger.info(
-        "lm_eval.tasks.initialize_tasks() is deprecated and no longer necessary. "
-        "It will be removed in v0.4.2 release. "
-        "TaskManager will instead be used."
-    )
-    return 0
-
-
 def get_task_name_from_config(task_config: Dict[str, str]) -> str:
     if "task" in task_config:
         return task_config["task"]
@@ -401,7 +379,7 @@ def get_task_name_from_object(task_object):
 
 
 def get_task_dict(
-    task_name_list: List[Union[str, Dict, Task]],
+    task_name_list: Union[str, List[Union[str, Dict, Task]]],
     task_manager: Optional[TaskManager] = None,
 ):
     """Creates a dictionary of task objects from either a name of task, config, or prepared Task object.
@@ -423,6 +401,15 @@
     if isinstance(task_name_list, str):
         task_name_list = [task_name_list]
+    elif isinstance(task_name_list, list):
+        if not all([isinstance(task, (str, dict, Task)) for task in task_name_list]):
+            raise TypeError(
+                "Expected all list items to be of types 'str', 'dict', or 'Task', but at least one entry did not match."
+            )
+    else:
+        raise TypeError(
+            f"Expected a 'str' or 'list' but received {type(task_name_list)}."
+        )
 
     string_task_name_list = [task for task in task_name_list if isinstance(task, str)]
     others_task_name_list = [task for task in task_name_list if not isinstance(task, str)]
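With the widened signature and the new checks, `get_task_dict` now accepts a bare task name as well as a list, and rejects anything else up front. A hedged usage sketch follows; the task names are just examples.

```python
from lm_eval.tasks import TaskManager, get_task_dict

tm = TaskManager()

# a single task name (str) is now accepted directly
single = get_task_dict("hellaswag", tm)

# lists may mix task names, config dicts, and prepared Task objects
several = get_task_dict(["hellaswag", "arc_easy"], tm)

# anything else fails fast with an informative TypeError
try:
    get_task_dict(42, tm)
except TypeError as err:
    print(err)  # Expected a 'str' or 'list' but received <class 'int'>.
```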
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "lm_eval"
-version = "0.4.1"
+version = "0.4.2"
 authors = [
     {name="EleutherAI", email="contact@eleuther.ai"}
 ]