add condition if --task is not a benchmark

2d96a8c8 · lintangsutawika · ed304c1d · 2d96a8c8
Commit 2d96a8c8 authored Jul 24, 2023 by lintangsutawika
Show whitespace changes
Inline Side-by-side

Showing with 12 additions and 10 deletions

lm_eval/evaluator.py lm_eval/evaluator.py +12 -10

No files found.
--- a/lm_eval/evaluator.py
+++ b/lm_eval/evaluator.py
@@ -398,6 +398,7 @@ def evaluate(
            #        | word_perplexity
            #        | byte_perplexity
            #        | bits_per_byte
+            if bool(task_groups):
                group_name = task_groups[task_name]
                if metric not in aggregate[group_name]:
                    aggregate[group_name][metric] = [task_score]
@@ -417,6 +418,7 @@ def evaluate(
                if stderr is not None:
                    results[task_name][metric + "_stderr" + "," + key] = stderr(items)
+        if not bool(aggregate):
            for group in aggregate.keys():
                for metric in aggregate[group].keys():
                    aggregate[group][metric] = np.average(aggregate[group][metric])
@@ -424,7 +426,7 @@ def evaluate(
        results_dict = {
            "results": dict(results),
-            "aggregate": dict(aggregate),
+            **({"aggregate": dict(aggregate)} if bool(aggregate) else {}),
            "configs": dict(configs),
            "versions": dict(versions),
        }