Commit 93c17c57 authored by haileyschoelkopf

add more error msgs, agg_metric -> agg_metric_list

parent 09dd7f6c
@@ -54,13 +54,18 @@ eval_logger = logging.getLogger("lm-eval")
 @dataclass
 class AggMetricConfig(dict):
-    metric: Optional[str] = "acc"
-    metric_alias: Optional[str] = None
+    metric: Optional[str] = None
     aggregation: Optional[str] = "mean"
     weight_by_size: Optional[str] = False
+    # list of filter names which should be incorporated into the aggregated metric.
     filter_list: Optional[Union[str, list]] = "none"

     def __post_init__(self):
+        if self.aggregation != "mean":
+            raise ValueError(
+                f"Currently, only 'mean' is supported for automatically aggregating scores across groups' subtasks. Got '{self.aggregation}'."
+            )
+
         if isinstance(self.filter_list, str):
             self.filter_list = [self.filter_list]
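A minimal, self-contained sketch of how the revised `AggMetricConfig` behaves, copying the fields and `__post_init__` from the hunk above into a standalone snippet; the module path and surrounding imports are not part of this diff, so the scaffolding here is assumed:

```python
from dataclasses import dataclass
from typing import Optional, Union


@dataclass
class AggMetricConfig(dict):
    # Fields mirror the hunk above.
    metric: Optional[str] = None
    aggregation: Optional[str] = "mean"
    weight_by_size: Optional[str] = False
    # list of filter names which should be incorporated into the aggregated metric.
    filter_list: Optional[Union[str, list]] = "none"

    def __post_init__(self):
        # New error message: only mean aggregation is supported for now.
        if self.aggregation != "mean":
            raise ValueError(
                f"Currently, only 'mean' is supported for automatically aggregating scores across groups' subtasks. Got '{self.aggregation}'."
            )
        # A bare string filter name is normalized into a one-element list.
        if isinstance(self.filter_list, str):
            self.filter_list = [self.filter_list]


cfg = AggMetricConfig(metric="acc", filter_list="none")
assert cfg.filter_list == ["none"]

try:
    AggMetricConfig(metric="acc", aggregation="median")  # rejected up front
except ValueError as err:
    print(err)
```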
@@ -70,8 +75,7 @@ class GroupConfig(dict):
     group: Optional[str] = None
     group_alias: Optional[str] = None
     task: Optional[Union[str, list]] = None
-    tag_to_task: Optional[str] = False
-    aggregate_metric: Optional[
+    aggregate_metric_list: Optional[
         Union[List[AggMetricConfig], AggMetricConfig, dict]
     ] = None
     metadata: Optional[
@@ -85,13 +89,13 @@ class GroupConfig(dict):
         return setattr(self, item, value)

     def __post_init__(self):
-        if self.aggregate_metric is not None:
-            if isinstance(self.aggregate_metric, dict):
-                self.aggregate_metric = [self.aggregate_metric]
+        if self.aggregate_metric_list is not None:
+            if isinstance(self.aggregate_metric_list, dict):
+                self.aggregate_metric_list = [self.aggregate_metric_list]

-            self.aggregate_metric = [
+            self.aggregate_metric_list = [
                 AggMetricConfig(**item) if isinstance(item, dict) else item
-                for item in self.aggregate_metric
+                for item in self.aggregate_metric_list
             ]

     def to_dict(self, keep_callable: bool = False) -> dict:
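A companion sketch of the renamed `aggregate_metric_list` handling: a single dict (or a plain list of dicts) is normalized into a list of `AggMetricConfig` instances. Only the fields touched by this hunk are reproduced, and the config values are illustrative, so treat this as an approximation of the class rather than a copy of the module:

```python
from dataclasses import dataclass
from typing import List, Optional, Union

# Assumes the AggMetricConfig sketch above is already defined in the session.


@dataclass
class GroupConfig(dict):
    group: Optional[str] = None
    group_alias: Optional[str] = None
    task: Optional[Union[str, list]] = None
    aggregate_metric_list: Optional[
        Union[List[AggMetricConfig], AggMetricConfig, dict]
    ] = None

    def __post_init__(self):
        if self.aggregate_metric_list is not None:
            # A single dict is wrapped into a list...
            if isinstance(self.aggregate_metric_list, dict):
                self.aggregate_metric_list = [self.aggregate_metric_list]
            # ...and every dict entry is promoted to an AggMetricConfig.
            self.aggregate_metric_list = [
                AggMetricConfig(**item) if isinstance(item, dict) else item
                for item in self.aggregate_metric_list
            ]


group_cfg = GroupConfig(
    group="my_group",
    task=["subtask_a", "subtask_b"],
    aggregate_metric_list={"metric": "acc", "weight_by_size": True},
)
assert isinstance(group_cfg.aggregate_metric_list[0], AggMetricConfig)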
@@ -213,6 +217,18 @@ class TaskConfig(dict):
     ] = None  # by default, not used in the code. allows for users to pass arbitrary info to tasks

     def __post_init__(self) -> None:
+        if self.group is not None:
+            eval_logger.warning(
+                "A task YAML file was found to contain a `group` key. Groups which provide aggregate scores over several subtasks now require a separate config file--if not aggregating, you may want to use the `tag` config option instead within your config. Setting `group` within a TaskConfig will be deprecated in v0.4.4. Please see https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/task_guide.md for more information."
+            )
+
+            if self.tag is None:
+                self.tag = self.group
+            else:
+                raise ValueError(
+                    "Got both a `group` and `tag` entry within a TaskConfig. Please use one or the other--`group` values will be deprecated in v0.4.4."
+                )
+
         if self.generation_kwargs is not None:
             if self.output_type != "generate_until":
                 eval_logger.warning(
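A rough illustration of the new deprecation path above: a legacy `group` value is migrated onto `tag`, while specifying both raises. The full `TaskConfig` has many more fields, so `MiniTaskConfig` below is a trimmed, hypothetical stand-in that only mimics the group/tag handling:

```python
import logging
from dataclasses import dataclass
from typing import Optional, Union

eval_logger = logging.getLogger("lm-eval")


@dataclass
class MiniTaskConfig:
    # Trimmed stand-in for TaskConfig; only the fields used by this hunk.
    group: Optional[Union[str, list]] = None
    tag: Optional[Union[str, list]] = None

    def __post_init__(self) -> None:
        if self.group is not None:
            eval_logger.warning(
                "`group` in a TaskConfig is deprecated in favor of `tag`; see the full message in the diff above."
            )
            if self.tag is None:
                # Legacy `group` values are migrated onto `tag`.
                self.tag = self.group
            else:
                raise ValueError(
                    "Got both a `group` and `tag` entry within a TaskConfig. Please use one or the other."
                )


cfg = MiniTaskConfig(group="legacy_group_name")
assert cfg.tag == "legacy_group_name"  # migrated automatically

try:
    MiniTaskConfig(group="a", tag="b")  # specifying both is now an error
except ValueError as err:
    print(err)
```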
@@ -616,16 +616,16 @@ def evaluate(
             )

             if (group_config is None) or (
-                group_config["aggregate_metric"] is None
+                group_config["aggregate_metric_list"] is None
             ):
                 results[group_or_task][" "] = " "
                 continue

-            if "aggregate_metric" in group_config:
-                agg_metric_list = group_config["aggregate_metric"]
+            if "aggregate_metric_list" in group_config:
+                agg_metric_list = group_config["aggregate_metric_list"]
                 show_group_table = show_group_table | bool(
-                    group_config["aggregate_metric"]
+                    group_config["aggregate_metric_list"]
                 )

             task_list = _task_aggregation_list[group_or_task]
@@ -660,9 +660,9 @@ def evaluate(
                 ]

                 for metric_config in agg_metric_list:
-                    for filter in metric_config["filter_list"]:
+                    for filter_name in metric_config["filter_list"]:
                         if metric != ",".join(
-                            [metric_config["metric"], filter]
+                            [metric_config["metric"], filter_name]
                         ):
                             continue
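The loop above matches result keys of the form `"<metric>,<filter>"`, as implied by the `",".join(...)` call; a tiny illustration, with the specific names chosen only as examples:

```python
# Result-dict keys pair a metric name with a filter name, e.g. "acc,none".
metric_key = ",".join(["acc", "none"])
assert metric_key == "acc,none"
```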
@@ -670,25 +670,25 @@ def evaluate(
                         if metric_config["aggregation"] == "mean":
                             aggregate_fn = lm_eval.api.metrics.aggregate_subtask_metrics
                         else:
-                            aggregate_fn = metric_config["aggregation"]
+                            raise ValueError(
+                                f"Currently, only 'mean' is supported for automatically aggregating scores across groups' subtasks. Got '{metric_config['aggregation']}' for group '{group_or_task}'"
+                            )

                         results[group_or_task][metric] = aggregate_fn(
                             metrics,
                             sizes,
                             metric_config["weight_by_size"],
                         )
-                        # TODO: calculate grouped metric using aggregation fn
+                        # TODO: calculate groups' metrics using arbitrary agg fns

                         if "N/A" in stderrs:
                             results[group_or_task][stderr] = "N/A"
                         else:
+                            # TODO: put in a warning, if we are using non-micro avg mean or another aggregation fn
                             results[group_or_task][
                                 stderr
                             ] = lm_eval.api.metrics.pooled_sample_stderr(
                                 stderrs, sizes
                             )
-                            # TODO: allow GroupConfigs to choose which variance formula is used, for back-compatibility
-                            # To use the old (likely incorrect) variance formula, comment out the above and uncomment this line:
-                            # results[group][stderr] = lm_eval.api.metrics.combined_sample_stderr(stderrs, sizes, metrics=metrics)

                         results[group_or_task]["samples"] = sum(sizes)

                         group_metadata = group_config.get("metadata", None)
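For intuition about the aggregation step, here is a sketch of the size-weighted mean that a call like `aggregate_fn(metrics, sizes, weight_by_size)` computes for a group. It is a generic reconstruction from the call signature, not the actual `lm_eval.api.metrics.aggregate_subtask_metrics` implementation, and the numbers are made up:

```python
from typing import List


def weighted_mean(metrics: List[float], sizes: List[int], weight_by_size: bool) -> float:
    """Aggregate per-subtask scores into one group score.

    With weight_by_size=True every sample counts equally (micro-average);
    with weight_by_size=False every subtask counts equally (macro-average).
    """
    weights = sizes if weight_by_size else [1] * len(metrics)
    return sum(m * w for m, w in zip(metrics, weights)) / sum(weights)


# Two subtasks: acc 0.50 on 100 samples and 0.80 on 300 samples.
print(weighted_mean([0.5, 0.8], [100, 300], weight_by_size=True))   # 0.725 (micro)
print(weighted_mean([0.5, 0.8], [100, 300], weight_by_size=False))  # 0.65  (macro)
```

As the hunk above shows, the group standard error is then pooled across subtasks via `pooled_sample_stderr`, and is reported as "N/A" whenever any subtask stderr is unavailable.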