Merge pull request #802 from EleutherAI/fix-metrics

Merge Fix metrics branch

Merge pull request #802 from EleutherAI/fix-metrics
Merge Fix metrics branch
a005aeba · Lintang Sutawika · GitHub · c01d5bac · f5d0f8e6 · a005aeba
Unverified commit a005aeba, authored Aug 24, 2023 by Lintang Sutawika; committed by GitHub on Aug 24, 2023.
Show whitespace changes
Inline Side-by-side

Showing 3 changed files with 9 additions and 9 deletions

ignore.txt ignore.txt +1 -1

lm_eval/api/task.py lm_eval/api/task.py +3 -3

lm_eval/evaluator.py lm_eval/evaluator.py +5 -5

No files found.
--- a/ignore.txt
+++ b/ignore.txt
--- a/lm_eval/api/task.py
+++ b/lm_eval/api/task.py
@@ -659,14 +659,14 @@ class ConfigurableTask(Task):
            self.multiple_target = len(test_target)
        else:
            if (type(test_target) is int) and (test_choice is not None):
-                test_target = [self.doc_to_choice(test_target)[test_target]]
+                test_target = test_choice[test_target]
            else:
-                test_target = [test_target]
+                test_target = str(test_target)
        if test_choice is not None:
            check_choices = test_choice
        else:
-            check_choices = test_target
+            check_choices = [test_target]
        for choice in check_choices:
            choice_has_whitespace = True if " " in choice else False

--- a/lm_eval/evaluator.py
+++ b/lm_eval/evaluator.py
@@ -219,7 +219,6 @@ def evaluate(
    padding_requests = collections.defaultdict(int)
    # Stores group related keys and values for group-aggregation
-    aggregate = collections.defaultdict(dict)
    task_groups = collections.defaultdict(dict)
    # get lists of each type of request
@@ -228,6 +227,7 @@ def evaluate(
        if type(task) == tuple:
            group, task = task
            task_groups[task_name] = group
+            aggregate[task_name] = {}
        versions[task_name] = task.VERSION
        configs[task_name] = dict(task.dump_config())
@@ -407,12 +407,12 @@ def evaluate(
            #        | word_perplexity
            #        | byte_perplexity
            #        | bits_per_byte
-            if bool(task_groups):
+            if task_name in task_groups:
                group_name = task_groups[task_name]
-                if metric not in aggregate[group_name]:
+                if metric in list(aggregate[group_name].keys()):
-                    aggregate[group_name][metric] = [task_score]
-                else:
                    aggregate[group_name][metric].append(task_score)
+                else:
+                    aggregate[group_name][metric] = [task_score]
            # hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
            # so we run them less iterations. still looking for a cleaner way to do this