Commit df7fee6e authored by haileyschoelkopf
Browse files

merge with lintang-multimodal-prototyping

parents 9b9ca7bf 8db0a470
...@@ -1280,7 +1280,7 @@ class ConfigurableTask(Task): ...@@ -1280,7 +1280,7 @@ class ConfigurableTask(Task):
def doc_to_image(self, doc: Any) -> Union[int, str, list]: def doc_to_image(self, doc: Any) -> Union[int, str, list]:
if self.config.doc_to_image is None: if self.config.doc_to_image is None:
eval_logger.error("doc_to_image was called but not set in config") return None
else: else:
doc_to_image = self.config.doc_to_image doc_to_image = self.config.doc_to_image
......
...@@ -216,7 +216,15 @@ class HFMultimodalLM(HFLM): ...@@ -216,7 +216,15 @@ class HFMultimodalLM(HFLM):
### Up to here: was identical to non-multimodal HFLM generate_until ### ### Up to here: was identical to non-multimodal HFLM generate_until ###
for chunk in chunks: for idx, _chunk in enumerate(chunks):
if idx == 0:
zero_chunk = _chunk
chunk = _chunk
elif idx == 69:
chunk = zero_chunk
else:
chunk = _chunk
chunk = _chunk
contexts, all_gen_kwargs, aux_arguments = zip( contexts, all_gen_kwargs, aux_arguments = zip(
*chunk *chunk
) # TODO: can we cut down further on number of distinct things we pass around? ) # TODO: can we cut down further on number of distinct things we pass around?
......
import collections import collections
import inspect
import logging import logging
import os import os
from functools import partial from functools import partial
from typing import Dict, List, Mapping, Optional, Union from typing import Dict, List, Mapping, Optional, Union
from lm_eval import utils from lm_eval import utils
from lm_eval.api.group import ConfigurableGroup, GroupConfig
from lm_eval.api.task import ConfigurableTask, Task from lm_eval.api.task import ConfigurableTask, Task
from lm_eval.evaluator_utils import get_subtask_list
GROUP_ONLY_KEYS = list(GroupConfig().to_dict().keys())
class TaskManager: class TaskManager:
...@@ -80,7 +86,12 @@ class TaskManager: ...@@ -80,7 +86,12 @@ class TaskManager:
return False return False
def _name_is_task(self, name) -> bool: def _name_is_task(self, name) -> bool:
if self._name_is_registered(name) and ("task" in self.task_index[name]["type"]): if self._name_is_registered(name) and (self.task_index[name]["type"] == "task"):
return True
return False
def _name_is_tag(self, name) -> bool:
if self._name_is_registered(name) and (self.task_index[name]["type"] == "tag"):
return True return True
return False return False
...@@ -141,6 +152,14 @@ class TaskManager: ...@@ -141,6 +152,14 @@ class TaskManager:
config["group_alias"] = None config["group_alias"] = None
return config return config
def _class_has_config_in_constructor(self, cls):
constructor = getattr(cls, "__init__", None)
return (
"config" in inspect.signature(constructor).parameters
if constructor
else False
)
def _load_individual_task_or_group( def _load_individual_task_or_group(
self, self,
name_or_config: Optional[Union[str, dict]] = None, name_or_config: Optional[Union[str, dict]] = None,
...@@ -148,82 +167,113 @@ class TaskManager: ...@@ -148,82 +167,113 @@ class TaskManager:
update_config: Optional[dict] = None, update_config: Optional[dict] = None,
yaml_path: Optional[str] = None, yaml_path: Optional[str] = None,
) -> Mapping: ) -> Mapping:
def load_task(config, task, group=None, yaml_path=None): def _load_task(config, task, yaml_path=None):
if "include" in config: if "include" in config:
if yaml_path is None:
raise ValueError
config = { config = {
**utils.load_yaml_config( **utils.load_yaml_config(
yaml_path, yaml_path=yaml_path,
yaml_config={"include": config.pop("include")}, yaml_config={"include": config.pop("include")},
mode="full", mode="full",
), ),
**config, **config,
} }
if self._config_is_python_task(config): if self._config_is_python_task(config):
task_object = config["class"]() if self._class_has_config_in_constructor(config["class"]):
task_object = config["class"](config=config)
else:
task_object = config["class"]()
if isinstance(task_object, ConfigurableTask):
# very scuffed: set task name here. TODO: fixme?
task_object.config.task = config["task"]
else: else:
config = self._process_alias(config, group=group)
task_object = ConfigurableTask(config=config) task_object = ConfigurableTask(config=config)
if group is not None:
task_object = (group, task_object)
return {task: task_object} return {task: task_object}
def _get_group_and_subtask_from_config(config):
group_name = ConfigurableGroup(config=config)
subtask_list = []
for task in group_name.config["task"]:
if isinstance(task, str) and self._name_is_tag(task):
subtask_list.extend(self._get_tasklist(task))
else:
subtask_list.append(task)
return group_name, subtask_list
def _process_group_config(config, update_config=None):
if update_config is not None:
config = {**config, **update_config}
_update_config = {
k: v for k, v in config.items() if k not in GROUP_ONLY_KEYS
}
if not bool(_update_config):
_update_config = None
group_config = {k: v for k, v in config.items() if k in GROUP_ONLY_KEYS}
return group_config, _update_config
if isinstance(name_or_config, str): if isinstance(name_or_config, str):
if update_config is not None: if update_config is not None:
# Process name_or_config as a dict instead # Process name_or_config as a dict instead
name_or_config = {"task": name_or_config, **update_config} name_or_config = {"task": name_or_config, **update_config}
elif self._name_is_task(name_or_config): elif self._name_is_task(name_or_config) or self._name_is_python_task(
name_or_config
):
task_config = self._get_config(name_or_config) task_config = self._get_config(name_or_config)
return load_task(task_config, task=name_or_config, group=parent_name) return _load_task(task_config, task=name_or_config)
else: else:
group_name = name_or_config
subtask_list = self._get_tasklist(name_or_config) subtask_list = self._get_tasklist(name_or_config)
if subtask_list == -1: if subtask_list == -1:
group_config = self._get_config(name_or_config) group_config = self._get_config(name_or_config)
subtask_list = group_config["task"] group_config, update_config = _process_group_config(group_config)
group_name, subtask_list = _get_group_and_subtask_from_config(
# This checks if we're at the root. group_config
if parent_name is None: )
group_config = self._get_config(name_or_config) yaml_path = self._get_yaml_path(group_name.group)
if set(group_config.keys()) > {"task", "group"}: else:
update_config = { if self._name_is_tag(name_or_config):
k: v fn = partial(
for k, v in group_config.items() self._load_individual_task_or_group,
if k not in ["task", "group"] update_config=name_or_config
} if isinstance(name_or_config, dict)
yaml_path = self._get_yaml_path(group_name) else None,
)
if (update_config is not None) and ("group_alias" in update_config): return dict(
group_name = update_config["group_alias"] collections.ChainMap(*map(fn, reversed(subtask_list)))
update_config.pop("group_alias") )
else:
group_name = ConfigurableGroup(
config={"group": name_or_config, "task": subtask_list}
)
if isinstance(name_or_config, dict): if isinstance(name_or_config, dict):
if update_config is not None:
name_or_config = {
**name_or_config,
**update_config,
}
if self._config_is_task(name_or_config): if self._config_is_task(name_or_config):
# name = name_or_config.pop("task")
name = name_or_config["task"] name = name_or_config["task"]
if update_config is not None:
name_or_config = {**name_or_config, **update_config}
# If the name is registered as a group # If the name is registered as a group
# if self._name_is_task(name) is False:
if self._name_is_group(name): if self._name_is_group(name):
group_name = name group_config = self._get_config(name)
update_config = {
k: v for k, v in name_or_config.items() if k != "task" group_config, update_config = _process_group_config(
} group_config, name_or_config
)
group_name, subtask_list = _get_group_and_subtask_from_config(
group_config
)
elif self._name_is_tag(name):
subtask_list = self._get_tasklist(name) subtask_list = self._get_tasklist(name)
if subtask_list == -1: fn = partial(
subtask_list = self._get_config(name)["task"] self._load_individual_task_or_group,
update_config=name_or_config,
)
return dict(collections.ChainMap(*map(fn, reversed(subtask_list))))
else: else:
if self._name_is_registered(name): if self._name_is_registered(name):
base_task_config = self._get_config(name) base_task_config = self._get_config(name)
# Check if this is a duplicate. # Check if this is a duplicate.
if parent_name is not None: if parent_name is not None:
name_or_config["group"] = parent_name
num_duplicate = len( num_duplicate = len(
list( list(
filter( filter(
...@@ -242,22 +292,12 @@ class TaskManager: ...@@ -242,22 +292,12 @@ class TaskManager:
} }
else: else:
task_config = name_or_config task_config = name_or_config
return load_task( return _load_task(task_config, task=name, yaml_path=yaml_path)
task_config, task=name, group=parent_name, yaml_path=yaml_path
)
else: else:
group_name = name_or_config["group"] group_config, update_config = _process_group_config(name_or_config)
subtask_list = name_or_config["task"] group_name, subtask_list = _get_group_and_subtask_from_config(
if set(name_or_config.keys()) > {"task", "group"}: group_config
update_config = { )
k: v
for k, v in name_or_config.items()
if k not in ["task", "group"]
}
all_subtasks = {}
if parent_name is not None:
all_subtasks = {group_name: (parent_name, None)}
fn = partial( fn = partial(
self._load_individual_task_or_group, self._load_individual_task_or_group,
...@@ -265,11 +305,9 @@ class TaskManager: ...@@ -265,11 +305,9 @@ class TaskManager:
update_config=update_config, update_config=update_config,
yaml_path=yaml_path, yaml_path=yaml_path,
) )
all_subtasks = { return {
**all_subtasks, group_name: dict(collections.ChainMap(*map(fn, reversed(subtask_list))))
**dict(collections.ChainMap(*map(fn, subtask_list))),
} }
return all_subtasks
def load_task_or_group(self, task_list: Optional[Union[str, list]] = None) -> dict: def load_task_or_group(self, task_list: Optional[Union[str, list]] = None) -> dict:
"""Loads a dictionary of task objects from a list """Loads a dictionary of task objects from a list
...@@ -293,10 +331,11 @@ class TaskManager: ...@@ -293,10 +331,11 @@ class TaskManager:
def _get_task_and_group(self, task_dir: str): def _get_task_and_group(self, task_dir: str):
"""Creates a dictionary of tasks index with the following metadata, """Creates a dictionary of tasks index with the following metadata,
- `type`, that can be either `task`, `python_task`, or `group`. - `type`, that can be either `task`, `python_task`, `group` or `tags`.
`task` refer to regular task configs, `python_task` are special `task` refer to regular task configs, `python_task` are special
yaml files that only consists of `task` and `class` parameters. yaml files that only consists of `task` and `class` parameters.
`group` are group configs. `group` are group configs. `tags` are labels that can be assigned
to tasks to assist in sorting and calling tasks of certain themes.
- `yaml_path`, path to the yaml file. If the entry is a `group` that - `yaml_path`, path to the yaml file. If the entry is a `group` that
was configured through a task config, the yaml_path will be -1 was configured through a task config, the yaml_path will be -1
and all subtasks will be listed in `task` (see below) and all subtasks will be listed in `task` (see below)
...@@ -312,6 +351,8 @@ class TaskManager: ...@@ -312,6 +351,8 @@ class TaskManager:
:return :return
Dictionary of task names as key and task metadata Dictionary of task names as key and task metadata
""" """
# TODO: remove group in next release
print_info = True
ignore_dirs = [ ignore_dirs = [
"__pycache__", "__pycache__",
".ipynb_checkpoints", ".ipynb_checkpoints",
...@@ -358,20 +399,38 @@ class TaskManager: ...@@ -358,20 +399,38 @@ class TaskManager:
"yaml_path": yaml_path, "yaml_path": yaml_path,
} }
if "group" in config: # TODO: remove group in next release
groups = config["group"] for attr in ["tag", "group"]:
if isinstance(config["group"], str): if attr in config:
groups = [groups] if attr == "group" and print_info:
self.logger.info(
for group in groups: "`group` and `group_alias` keys in tasks' configs will no longer be used in the next release of lm-eval. "
if group not in tasks_and_groups: "`tag` will be used to allow to call a collection of tasks just like `group`. "
tasks_and_groups[group] = { "`group` will be removed in order to not cause confusion with the new ConfigurableGroup "
"type": "group", "which will be the offical way to create groups with addition of group-wide configuations."
"task": [task], )
"yaml_path": -1, print_info = False
} # attr = "tag"
else:
tasks_and_groups[group]["task"].append(task) attr_list = config[attr]
if isinstance(attr_list, str):
attr_list = [attr_list]
for tag in attr_list:
if tag not in tasks_and_groups:
tasks_and_groups[tag] = {
"type": "tag",
"task": [task],
"yaml_path": -1,
}
elif tasks_and_groups[tag]["type"] != "tag":
self.logger.info(
f"The tag {tag} is already registered as a group, this tag will not be registered. "
"This may affect tasks you want to call."
)
break
else:
tasks_and_groups[tag]["task"].append(task)
else: else:
self.logger.debug(f"File {f} in {root} could not be loaded") self.logger.debug(f"File {f} in {root} could not be loaded")
...@@ -400,6 +459,33 @@ def get_task_name_from_object(task_object): ...@@ -400,6 +459,33 @@ def get_task_name_from_object(task_object):
) )
def _check_duplicates(task_dict: dict) -> List[str]:
"""helper function solely used in validating get_task_dict output.
Takes the output of lm_eval.evaluator_utils.get_subtask_list and
returns a list of all leaf subtasks contained within, and errors if any such leaf subtasks are
"oversubscribed" to several disjoint groups.
"""
subtask_names = []
for key, value in task_dict.items():
subtask_names.extend(value)
duplicate_tasks = {
task_name for task_name in subtask_names if subtask_names.count(task_name) > 1
}
# locate the potentially problematic groups that seem to 'compete' for constituent subtasks
competing_groups = [
group
for group in task_dict.keys()
if len(set(task_dict[group]).intersection(duplicate_tasks)) > 0
]
if len(duplicate_tasks) > 0:
raise ValueError(
f"Found 1 or more tasks while trying to call get_task_dict() that were members of more than 1 called group: {list(duplicate_tasks)}. Offending groups: {competing_groups}. Please call groups which overlap their constituent tasks in separate evaluation runs."
)
def get_task_dict( def get_task_dict(
task_name_list: Union[str, List[Union[str, Dict, Task]]], task_name_list: Union[str, List[Union[str, Dict, Task]]],
task_manager: Optional[TaskManager] = None, task_manager: Optional[TaskManager] = None,
...@@ -417,6 +503,7 @@ def get_task_dict( ...@@ -417,6 +503,7 @@ def get_task_dict(
:return :return
Dictionary of task objects Dictionary of task objects
""" """
task_name_from_string_dict = {} task_name_from_string_dict = {}
task_name_from_config_dict = {} task_name_from_config_dict = {}
task_name_from_object_dict = {} task_name_from_object_dict = {}
...@@ -463,8 +550,15 @@ def get_task_dict( ...@@ -463,8 +550,15 @@ def get_task_dict(
): ):
raise ValueError raise ValueError
return { final_task_dict = {
**task_name_from_string_dict, **task_name_from_string_dict,
**task_name_from_config_dict, **task_name_from_config_dict,
**task_name_from_object_dict, **task_name_from_object_dict,
} }
# behavior can get odd if one tries to invoke several groups that "compete" for the same task.
# (notably, because one could request several num_fewshot values at once in GroupConfig overrides for the subtask
# and we'd be unsure which to use and report.)
# we explicitly check and error in this case.
_check_duplicates(get_subtask_list(final_task_dict))
return final_task_dict
group: mmmu_business group: mmmu_business
group_alias: Business group_alias: Business
task: task:
# - task: mmmu_accounting - task: mmmu_accounting
# include: _template_yaml include: _template_yaml
# task_alias: Accounting task_alias: Accounting
# dataset_name: Accounting dataset_name: Accounting
- task: mmmu_economics - task: mmmu_economics
include: _template_yaml include: _template_yaml
task_alias: Economics task_alias: Economics
dataset_name: Economics dataset_name: Economics
# - task: mmmu_finance - task: mmmu_finance
# include: _template_yaml include: _template_yaml
# task_alias: Finance task_alias: Finance
# dataset_name: Finance dataset_name: Finance
# - task: mmmu_manage - task: mmmu_manage
# include: _template_yaml include: _template_yaml
# task_alias: Manage task_alias: Manage
# dataset_name: Manage dataset_name: Manage
# - task: mmmu_marketing - task: mmmu_marketing
# include: _template_yaml include: _template_yaml
# task_alias: Marketing task_alias: Marketing
# dataset_name: Marketing dataset_name: Marketing
...@@ -361,12 +361,14 @@ def make_table(result_dict, column: str = "results", sort_results: bool = False) ...@@ -361,12 +361,14 @@ def make_table(result_dict, column: str = "results", sort_results: bool = False)
hib = HIGHER_IS_BETTER_SYMBOLS.get(higher_is_better.get(m), "") hib = HIGHER_IS_BETTER_SYMBOLS.get(higher_is_better.get(m), "")
v = "%.4f" % v if isinstance(v, float) else v
if m + "_stderr" + "," + f in dic: if m + "_stderr" + "," + f in dic:
se = dic[m + "_stderr" + "," + f] se = dic[m + "_stderr" + "," + f]
se = " N/A" if se == "N/A" else "%.4f" % se se = " N/A" if se == "N/A" else "%.4f" % se
values.append([k, version, f, n, m, hib, "%.4f" % v, "±", se]) values.append([k, version, f, n, m, hib, v, "±", se])
else: else:
values.append([k, version, f, n, m, hib, "%.4f" % v, "", ""]) values.append([k, version, f, n, m, hib, v, "", ""])
k = "" k = ""
version = "" version = ""
md_writer.value_matrix = values md_writer.value_matrix = values
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment