Re-add aggregation

787b23f6 · lintangsutawika · aaf64aab · 787b23f6
Commit 787b23f6 authored Jan 02, 2024 by lintangsutawika
Hide whitespace changes
Inline Side-by-side

Showing with 3 additions and 4 deletions

lm_eval/evaluator.py lm_eval/evaluator.py +3 -4

No files found.
--- a/lm_eval/evaluator.py
+++ b/lm_eval/evaluator.py
@@ -449,16 +449,15 @@ def evaluate(
            else:
                group_name = None
-            metric_fn = task.compute_metric()[metric]
+            agg_fn = task.aggregation()[metric]
-            results[task_name][metric_key] = metric_fn(items)
+            results[task_name][metric_key] = agg_fn(items)
            results[task_name]["samples"] = len(items)
            # hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
            # so we run them less iterations. still looking for a cleaner way to do this
            if bootstrap_iters > 0:
                stderr = lm_eval.api.metrics.stderr_for_metric(
-                    # metric=task.aggregation()[metric],
+                    metric=task.aggregation()[metric],
-                    metric=task.compute_metric()[metric],
                    bootstrap_iters=min(bootstrap_iters, 100)
                    if metric in ["bleu", "chrf", "ter"]
                    else bootstrap_iters,