]=None# by default, not used in the code. allows for users to pass arbitrary info to tasks
]=None# by default, not used in the code. allows for users to pass arbitrary info to tasks
def __post_init__(self) -> None:
    """Validate and migrate deprecated config keys after dataclass init.

    Handles the legacy ``group`` key: warns that it is deprecated,
    migrates its value into ``tag`` when no ``tag`` was given, and
    raises if both ``group`` and ``tag`` are set.

    Raises:
        ValueError: if both ``group`` and ``tag`` are present in the config.
    """
    if self.group is not None:
        # `group` on a TaskConfig is deprecated (slated for v0.4.4);
        # aggregate groups now live in a separate config file.
        eval_logger.warning(
            "A task YAML file was found to contain a `group` key. Groups which provide aggregate scores over several subtasks now require a separate config file--if not aggregating, you may want to use the `tag` config option instead within your config. Setting `group` within a TaskConfig will be deprecated in v0.4.4. Please see https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/task_guide.md for more information."
        )

        if self.tag is None:
            # Backward compatibility: treat the legacy `group` value as a tag.
            self.tag = self.group
        else:
            # Ambiguous config: the user supplied both the old and new key.
            raise ValueError(
                "Got both a `group` and `tag` entry within a TaskConfig. Please use one or the other--`group` values will be deprecated in v0.4.4."
            )
f"Currently, only 'mean' is supported for automatically aggregating scores across groups' subtasks. Got '{metric_config['aggregation']}' for group '{group_or_task}'"
)
results[group_or_task][metric]=aggregate_fn(
results[group_or_task][metric]=aggregate_fn(
metrics,
metrics,
sizes,
sizes,
metric_config["weight_by_size"],
metric_config["weight_by_size"],
)
)
# TODO: calculate grouped metric using aggregation fn
# TODO: calculate groups' metrics using arbitrary agg fns
if"N/A"instderrs:
if"N/A"instderrs:
results[group_or_task][stderr]="N/A"
results[group_or_task][stderr]="N/A"
else:
else:
# TODO: put in a warning, if we are using non-micro avg mean or another aggregation fn
results[group_or_task][
results[group_or_task][
stderr
stderr
]=lm_eval.api.metrics.pooled_sample_stderr(
]=lm_eval.api.metrics.pooled_sample_stderr(
stderrs,sizes
stderrs,sizes
)
)
# TODO: allow GroupConfigs to choose which variance formula is used, for back-compatibility
# To use the old (likely incorrect) variance formula, comment out the above and uncomment this line: