"...lm-evaluation-harness.git" did not exist on "e86e7b2767e0bbe2413708197641e28d04f9feaa"
Unverified Commit 04dde62c authored by Lintang Sutawika, committed by GitHub

Update evaluator.py

parent 8680e938
@@ -524,19 +524,37 @@ def evaluate(
                 # or `task_name: []`.
                 # we only want to operate on groups here.
                 continue
-            for metric in [
-                key
-                for key in results[task_list[0]].keys()
-                if "_stderr" not in key and key not in ["alias", "samples"]
-            ]:  # TODO: what if tasks don't all share the same metrics
+            group_metrics = list(
+                dict.fromkeys(
+                    [
+                        key
+                        for task in task_list
+                        for key in results[task].keys()
+                        if "_stderr" not in key and key not in ["alias", "samples"]
+                    ]
+                )
+            )
+            for metric in group_metrics:
+                # TODO: what if tasks don't all share the same metrics
                 stderr = "_stderr,".join(metric.split(","))
                 # gather metrics, sizes, and stderrs from subtasks
                 metrics = [
-                    results[task][metric] for task in task_list
+                    results[task][metric]
+                    for task in task_list
+                    if metric in results[task]
                 ]  # TODO: copy?
-                stderrs = [results[task][stderr] for task in task_list]
-                sizes = [results[task]["samples"] for task in task_list]
+                stderrs = [
+                    results[task][stderr]
+                    for task in task_list
+                    if stderr in results[task]
+                ]
+                sizes = [
+                    results[task]["samples"]
+                    for task in task_list
+                    if metric in results[task]
+                ]
                 # compute group's pooled metric and stderr
                 results[group][
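In short, the new version builds the group's metric list as a union over all subtasks, de-duplicated in insertion order with dict.fromkeys, instead of reading only the first subtask's keys, and it guards each per-task lookup with an `in` check so subtasks that do not report a given metric are skipped. A minimal standalone sketch of that idiom follows; the `results` dict, task names, and metric keys below are invented for illustration and are not actual harness output.

    # Sketch of the dedup-and-filter idiom from the patch, with made-up data.
    results = {
        "subtask_a": {"acc,none": 0.71, "acc_stderr,none": 0.02, "samples": 100},
        "subtask_b": {"acc,none": 0.64, "acc_stderr,none": 0.03, "f1,none": 0.58, "samples": 80},
    }
    task_list = ["subtask_a", "subtask_b"]

    # Union of metric keys across all subtasks, order-preserving and
    # de-duplicated via dict.fromkeys (not just the first subtask's keys).
    group_metrics = list(
        dict.fromkeys(
            key
            for task in task_list
            for key in results[task].keys()
            if "_stderr" not in key and key not in ["alias", "samples"]
        )
    )
    print(group_metrics)  # ['acc,none', 'f1,none']

    for metric in group_metrics:
        # "acc,none" -> "acc_stderr,none"
        stderr = "_stderr,".join(metric.split(","))
        # Only subtasks that actually report this metric contribute values,
        # so metrics and sizes stay aligned even when subtasks differ.
        metrics = [results[task][metric] for task in task_list if metric in results[task]]
        stderrs = [results[task][stderr] for task in task_list if stderr in results[task]]
        sizes = [results[task]["samples"] for task in task_list if metric in results[task]]
        print(metric, metrics, stderrs, sizes)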