"vscode:/vscode.git/clone" did not exist on "e748d7853daa5a0dcb9afc0a7581af148ab2fc3c"
Unverified Commit 5627e819 authored by Hailey Schoelkopf, committed by GitHub

Cleanup for v0.4.2 release (#1573)

* Update interface.md

* fix: make caching reqs always work with accelerate launch

* remove stale task migration checklist

* remove deprecation warnings

* make informative TypeErrors for get_task_dict

* bump version metadata

* fix num_fewshot printing bug

* add fewshot value to cache key
parent 6fae67a6
In `interface.md`, the external-library usage example is corrected:

````diff
@@ -112,8 +112,8 @@ my_model = initialize_my_model()
 # - `Your_LM.generate_until()`
 lm_obj = Your_LM(model=my_model, batch_size=16)
 
-# The task_manager indexes tasks including ones
-# specified by the user through `include_path`
+# optional: the task_manager indexes tasks including ones
+# specified by the user through `include_path`.
 task_manager = lm_eval.tasks.TaskManager(
     include_path="/path/to/custom/yaml"
 )
@@ -138,9 +138,9 @@ task_dict = lm_eval.tasks.get_task_dict(
     # custom paths is required.
 )
 
-def evaluate(
+results = evaluate(
     lm=lm_obj,
     task_dict=task_dict,
     ...
-):
+)
 ```
````
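Assembled, the corrected docs snippet reads roughly as below. This is a sketch pieced together from the two hunks above; `Your_LM`, `initialize_my_model`, and the YAML path are the documentation's placeholders, and `"hellaswag"` stands in for whatever tasks the caller wants.

```python
import lm_eval
from lm_eval.evaluator import evaluate

my_model = initialize_my_model()  # placeholder: the user's own model setup

# `Your_LM` is a user-defined LM subclass implementing loglikelihood(),
# loglikelihood_rolling(), and generate_until()
lm_obj = Your_LM(model=my_model, batch_size=16)

# optional: the task_manager indexes tasks, including ones
# specified by the user through `include_path`
task_manager = lm_eval.tasks.TaskManager(
    include_path="/path/to/custom/yaml"
)

task_dict = lm_eval.tasks.get_task_dict(
    ["hellaswag"],  # entries may be task-name strings, config dicts, or Task objects
    task_manager,
)

# the docs previously showed a `def evaluate(...)` signature here;
# the fixed example calls evaluate() and keeps its results dict
results = evaluate(
    lm=lm_obj,
    task_dict=task_dict,
)
```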
In the CLI entrypoint module, the deprecated imports are dropped:

```diff
@@ -13,7 +13,7 @@ import numpy as np
 from lm_eval import evaluator, utils
 from lm_eval.evaluator import request_caching_arg_to_dict
 from lm_eval.logging_utils import WandbLogger
-from lm_eval.tasks import TaskManager, include_path, initialize_tasks
+from lm_eval.tasks import TaskManager
 from lm_eval.utils import make_table, simple_parse_args_string
```
In `cli_evaluate`, the `initialize_tasks()` call is replaced by the `TaskManager`-based setup, and the old `include_path()` handling is removed:

```diff
@@ -258,7 +258,8 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
             "Specify --output_path if providing --log_samples or --predict_only"
         )
 
-    initialize_tasks(args.verbosity)
+    if args.include_path is not None:
+        eval_logger.info(f"Including path: {args.include_path}")
     task_manager = TaskManager(args.verbosity, include_path=args.include_path)
 
     if args.limit:
@@ -266,9 +267,6 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
             " --limit SHOULD ONLY BE USED FOR TESTING."
             "REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
        )
-    if args.include_path is not None:
-        eval_logger.info(f"Including path: {args.include_path}")
-        include_path(args.include_path)
 
     if args.tasks is None:
         eval_logger.error("Need to specify task to evaluate.")
```
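For external scripts migrating off the removed helpers, the change amounts to collapsing the old two-step setup into one constructor call. A minimal sketch (the include path is illustrative):

```python
from lm_eval.tasks import TaskManager

# before v0.4.2 (both helpers are deleted further down in this commit):
#   initialize_tasks("INFO")
#   include_path("/path/to/custom/yaml")

# after: a single TaskManager indexes built-in tasks and, optionally,
# any user-supplied YAML task configs under include_path
task_manager = TaskManager("INFO", include_path="/path/to/custom/yaml")
```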
In the `Task` base class, the request-cache key now encodes the few-shot count and the distributed rank/world size:

```diff
@@ -376,7 +376,7 @@ class Task(abc.ABC):
         # used with caching
         og_limit = limit
 
-        cache_key = f"requests-{self._config.task}"
+        cache_key = f"requests-{self._config.task}-{self.config.num_fewshot}shot-rank{rank}-world_size{world_size}"
 
         cached_instances = load_from_cache(file_name=cache_key)
```
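Under the old key, runs of the same task with different few-shot counts, or different ranks of a distributed run, collided on the same cache file and could silently reuse each other's cached requests. A small illustration of the keying change (a hypothetical standalone helper mirroring the diff's f-string, not the harness's API):

```python
def cache_key(task: str, num_fewshot: int, rank: int, world_size: int) -> str:
    # mirrors the f-string introduced in the diff above
    return f"requests-{task}-{num_fewshot}shot-rank{rank}-world_size{world_size}"

# previously both of these runs mapped to the single key "requests-hellaswag"
assert cache_key("hellaswag", 0, 0, 1) != cache_key("hellaswag", 5, 0, 1)
# shards of a 2-process run no longer clobber one another either
assert cache_key("hellaswag", 5, 0, 2) != cache_key("hellaswag", 5, 1, 2)
```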
In `simple_evaluate`, a stale deprecation notice is removed, and the few-shot override logic gains explanatory comments plus a fallback for tasks with no configured `num_fewshot`:

```diff
@@ -189,10 +189,6 @@ def simple_evaluate(
     if task_manager is None:
         task_manager = TaskManager(verbosity)
 
-    eval_logger.info(
-        "get_task_dict has been updated to accept an optional argument, `task_manager`"
-        "Read more here:https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md#external-library-usage"
-    )
     task_dict = get_task_dict(tasks, task_manager)
     for task_name in task_dict.keys():
         task_obj = task_dict[task_name]
@@ -215,6 +211,8 @@ def simple_evaluate(
             # we have to change the class properties post-hoc. This is pretty hacky.
             task_obj.override_metric(metric_name="bypass")
 
+        # override tasks' fewshot values to the provided num_fewshot arg value
+        # except if tasks have it set to 0 manually in their configs--then we should never overwrite that
         if num_fewshot is not None:
             if (default_num_fewshot := task_obj.get_config("num_fewshot")) == 0:
                 eval_logger.info(
@@ -225,6 +223,10 @@ def simple_evaluate(
                     f"Overwriting default num_fewshot of {task_name} from {default_num_fewshot} to {num_fewshot}"
                 )
                 task_obj.set_config(key="num_fewshot", value=num_fewshot)
+        else:
+            # if num_fewshot not provided, and the task does not define a default one, default to 0
+            if (default_num_fewshot := task_obj.get_config("num_fewshot")) is None:
+                task_obj.set_config(key="num_fewshot", value=0)
 
     if check_integrity:
         run_task_tests(task_list=tasks)
```
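The resulting precedence rules are easiest to see in isolation. A sketch (a hypothetical helper summarizing the branches above, not the harness's API; `task_default` stands for the task config's `num_fewshot`):

```python
from typing import Optional

def resolve_num_fewshot(cli_value: Optional[int], task_default: Optional[int]) -> int:
    """Summarize the few-shot precedence applied in simple_evaluate."""
    if cli_value is not None:
        if task_default == 0:
            # a task that pins num_fewshot to 0 in its config is never overridden
            return 0
        return cli_value
    # no CLI override: keep the task's default, else fall back to 0
    return task_default if task_default is not None else 0

assert resolve_num_fewshot(5, None) == 5     # CLI value applies
assert resolve_num_fewshot(5, 0) == 0        # config-pinned 0-shot wins
assert resolve_num_fewshot(None, None) == 0  # new fallback introduced above
```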
The stale task migration checklist removed by this commit read as follows:

# v1.0 Tasks
This list keeps track of which tasks' implementations have been ported to YAML / v2.0 of the Eval Harness.
Boxes should be checked iff a task is implemented in the refactor and tested for regression. Tasks should be struck through if they were checked against the implementation from the *original introducing paper* or the popularizing implementation. (WIP) denotes that a PR or person is already working on the task.
- [x] Glue
- [x] SuperGlue
- [x] CoQA
- [x] DROP
- [x] ~~Lambada~~
- [x] Lambada (Cloze variants)
- [x] ~~Lambada (Multilingual)~~
- [x] Wikitext
- [x] PiQA
- [x] PROST
- [x] MCTACO
- [x] Pubmed QA
- [x] SciQ
- [x] QASPER
- [x] QA4MRE
- [x] TriviaQA
- [x] AI2 ARC
- [x] LogiQA
- [x] HellaSwag
- [x] SWAG
- [x] OpenBookQA
- [ ] SQuADv2 (Lintang)
- [x] RACE
- [x] HeadQA
- [x] MathQA
- [x] WebQs
- [x] WSC273
- [x] Winogrande
- [x] ANLI
- [x] Hendrycks Ethics (missing some tasks/metrics, see PR 660: <https://github.com/EleutherAI/lm-evaluation-harness/pull/660> for more info)
- [x] TruthfulQA (mc1)
- [x] TruthfulQA (mc2)
- [x] TruthfulQA (gen)
- [x] MuTual
- [ ] Hendrycks Math (Hailey)
- [x] Asdiv
- [ ] GSM8k
- [x] Arithmetic
- [ ] MMMLU (Hailey)
- [x] Translation (WMT) suite
- [x] Unscramble
- [x] ~~Pile (perplexity)~~
- [x] BLiMP
- [x] ToxiGen
- [x] StoryCloze
- [ ] NaturalQs (Hailey)
- [x] CrowS-Pairs
- [x] XCopa
- [ ] BIG-Bench (Hailey)
- [x] XStoryCloze
- [x] XWinograd
- [x] PAWS-X
- [x] XNLI
- [x] MGSM
- [ ] SCROLLS
- [x] Babi
- [x] Belebele
# Novel Tasks
Tasks added in the revamped harness that were not previously available. Again, a strikethrough denotes checking performed *against the original task's implementation or published results introducing the task*.
# Task Wishlist
- [ ] TheoremQA
- [ ] Theorem Proving evaluations
- [ ] Chain of Thought
- [ ] Self-consistency; Least-to-Most prompting, etc.
- [ ] Summarization Tasks
- [ ] Anthropic Model-Written Evals
In the tasks module, the deprecated `include_path()` and `initialize_tasks()` shims are deleted, and `get_task_dict` now both accepts a bare string and validates its input with informative `TypeError`s:

```diff
@@ -356,28 +356,6 @@ class TaskManager:
         return tasks_and_groups
 
-def include_path(task_dir):
-    logger = utils.eval_logger
-    logger.setLevel(getattr(logging, "INFO"))
-    logger.info(
-        "To still use tasks loaded from args.include_path,"
-        "see an example of the new TaskManager API in "
-        "https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md#external-library-usage"
-    )
-    return 0
-
-def initialize_tasks(verbosity="INFO"):
-    logger = utils.eval_logger
-    logger.setLevel(getattr(logging, f"{verbosity}"))
-    logger.info(
-        "lm_eval.tasks.initialize_tasks() is deprecated and no longer necessary. "
-        "It will be removed in v0.4.2 release. "
-        "TaskManager will instead be used."
-    )
-    return 0
-
 def get_task_name_from_config(task_config: Dict[str, str]) -> str:
     if "task" in task_config:
         return task_config["task"]
@@ -401,7 +379,7 @@ def get_task_name_from_object(task_object):
 def get_task_dict(
-    task_name_list: List[Union[str, Dict, Task]],
+    task_name_list: Union[str, List[Union[str, Dict, Task]]],
     task_manager: Optional[TaskManager] = None,
 ):
     """Creates a dictionary of task objects from either a name of task, config, or prepared Task object.
@@ -423,6 +401,15 @@ def get_task_dict(
     if isinstance(task_name_list, str):
         task_name_list = [task_name_list]
+    elif isinstance(task_name_list, list):
+        if not all([isinstance(task, (str, dict, Task)) for task in task_name_list]):
+            raise TypeError(
+                "Expected all list items to be of types 'str', 'dict', or 'Task', but at least one entry did not match."
+            )
+    else:
+        raise TypeError(
+            f"Expected a 'str' or 'list' but received {type(task_name_list)}."
+        )
 
     string_task_name_list = [task for task in task_name_list if isinstance(task, str)]
     others_task_name_list = [task for task in task_name_list if ~isinstance(task, str)]
```
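From a caller's perspective, the new checks turn previously confusing downstream failures into immediate, descriptive errors. A sketch of the behavior (assumes the harness and its built-in `hellaswag` task are available):

```python
from lm_eval.tasks import get_task_dict

# a bare string is now accepted and promoted to a one-element list
task_dict = get_task_dict("hellaswag")

# a list containing a non-str/dict/Task entry fails fast
try:
    get_task_dict(["hellaswag", 42])
except TypeError as err:
    print(err)  # "Expected all list items to be of types 'str', 'dict', or 'Task', ..."

# any other top-level type is rejected outright
try:
    get_task_dict(("hellaswag",))
except TypeError as err:
    print(err)  # "Expected a 'str' or 'list' but received <class 'tuple'>."
```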
In `pyproject.toml`, the version metadata is bumped:

```diff
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "lm_eval"
-version = "0.4.1"
+version = "0.4.2"
 authors = [
     {name="EleutherAI", email="contact@eleuther.ai"}
 ]
```
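Once this revision is installed, the bump is visible through standard package metadata; a quick check (assumes the package is installed from this commit):

```python
from importlib.metadata import version

# reads the version declared in pyproject.toml's [project] table
print(version("lm_eval"))  # expected: 0.4.2
```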