gaoqiong / lm-evaluation-harness

Commit 2f0ef6ca, authored Aug 23, 2023 by lintangsutawika
fix condition where a benchmark and a regular task is called together
Parent: 32294ff1
Showing 1 changed file, lm_eval/evaluator.py, with 5 additions and 5 deletions.
lm_eval/evaluator.py
@@ -219,7 +219,6 @@ def evaluate(
    padding_requests = collections.defaultdict(int)
    # Stores group related keys and values for group-aggregation
    aggregate = collections.defaultdict(dict)
    task_groups = collections.defaultdict(dict)
    # get lists of each type of request
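Both containers are plain collections.defaultdicts, so later code can index a request type or a group without initializing the key first. A small illustration of the two flavors, outside the harness and with made-up keys:

import collections

padding_requests = collections.defaultdict(int)   # missing key -> 0
aggregate = collections.defaultdict(dict)         # missing key -> {}

padding_requests["loglikelihood"] += 3            # no KeyError on first use
aggregate["ai2_arc"]["acc"] = [0.71]              # nested dict created on demand

print(padding_requests["loglikelihood"])          # 3
print(aggregate["ai2_arc"])                       # {'acc': [0.71]}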
@@ -228,6 +227,7 @@ def evaluate(
        if type(task) == tuple:
            group, task = task
            task_groups[task_name] = group
            aggregate[task_name] = {}
        versions[task_name] = task.VERSION
        configs[task_name] = dict(task.dump_config())
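The tuple check above is what tells a benchmark sub-task apart from a regular task: only tasks that arrive as a (group, task) pair get an entry in task_groups. A rough, self-contained sketch of that registration step, using a made-up DummyTask class and hypothetical task names rather than the harness's own objects:

import collections

class DummyTask:
    # Stand-in for a harness task object; only the attributes used below.
    VERSION = 0
    def dump_config(self):
        return {"num_fewshot": 0}

# A benchmark member arrives as (group, task); a regular task arrives bare.
task_dict = {
    "arc_easy": ("ai2_arc", DummyTask()),
    "lambada": DummyTask(),
}

task_groups = collections.defaultdict(dict)
aggregate = collections.defaultdict(dict)
versions, configs = {}, {}

for task_name, task in task_dict.items():
    if type(task) == tuple:
        group, task = task
        task_groups[task_name] = group
        aggregate[task_name] = {}
    versions[task_name] = task.VERSION
    configs[task_name] = dict(task.dump_config())

print(dict(task_groups))   # {'arc_easy': 'ai2_arc'} -- "lambada" is not registered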
@@ -407,12 +407,12 @@ def evaluate(
            # | word_perplexity
            # | byte_perplexity
            # | bits_per_byte
            if bool(task_groups):
            if task_name in task_groups:
                group_name = task_groups[task_name]
                if metric not in aggregate[group_name]:
                    aggregate[group_name][metric] = [task_score]
                else:
                if metric in list(aggregate[group_name].keys()):
                    aggregate[group_name][metric].append(task_score)
                else:
                    aggregate[group_name][metric] = [task_score]
            # hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
            # so we run them less iterations. still looking for a cleaner way to do this
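Per the commit message, the fix replaces the truthiness check bool(task_groups) with the membership check task_name in task_groups: the old condition is true as soon as any benchmark group exists, so a regular task evaluated alongside a benchmark would also fall into the group-aggregation branch even though it belongs to no group. A minimal sketch of the corrected logic, with made-up task names, metrics, and scores:

import collections

task_groups = collections.defaultdict(dict)   # task_name -> its benchmark group
aggregate = collections.defaultdict(dict)     # group -> {metric: [scores]}

task_groups["arc_easy"] = "ai2_arc"           # benchmark member; "lambada" (regular) is absent

results = [("arc_easy", "acc", 0.71), ("lambada", "acc", 0.68)]

for task_name, metric, task_score in results:
    # Old check `if bool(task_groups):` is True whenever any group exists,
    # so "lambada" would wrongly enter this branch; the membership check
    # only aggregates scores for tasks that actually belong to a group.
    if task_name in task_groups:
        group_name = task_groups[task_name]
        if metric in list(aggregate[group_name].keys()):
            aggregate[group_name][metric].append(task_score)
        else:
            aggregate[group_name][metric] = [task_score]

print(dict(aggregate))   # {'ai2_arc': {'acc': [0.71]}}

With the old condition, a groupless task would still reach group_name = task_groups[task_name], which on a defaultdict(dict) returns an empty dict rather than a group name, so mixing the two task types derailed group aggregation.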