Commit 1dc8f96f authored by lintangsutawika

default to weighted averaging

parent 92f25463
```diff
@@ -219,6 +219,7 @@ def evaluate(
     padding_requests = collections.defaultdict(int)
     # store the hierarchy to do proper ordering
     task_hierarchy = collections.defaultdict(list)
+    group_hierarchy = collections.defaultdict(list)
     # store the ordering of tasks and groups
     task_order = collections.defaultdict(int)
     # store the aggregation for aggregating across tasks in the same group
```
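
The first hunk only introduces `group_hierarchy`, a second `collections.defaultdict(list)` alongside `task_hierarchy`. Both are plain mappings from a group name to the names of its members. A minimal sketch of the shape these containers take, using hypothetical group and task names rather than anything loaded from real task configs:

```python
import collections

# Hypothetical names, for illustration only: the real keys come from the
# task/group configs the harness loads at runtime.
task_hierarchy = collections.defaultdict(list)
task_hierarchy["arithmetic"].extend(["arithmetic_2da", "arithmetic_2ds"])

group_hierarchy = collections.defaultdict(list)
group_hierarchy["math_suite"].append("arithmetic")  # a group whose member is another group

print(dict(task_hierarchy))   # {'arithmetic': ['arithmetic_2da', 'arithmetic_2ds']}
print(dict(group_hierarchy))  # {'math_suite': ['arithmetic']}
```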
```diff
@@ -450,22 +451,26 @@ def evaluate(
         agg_fn = task.aggregation()[metric]
         task_score = agg_fn(items)
+        task_size = len(items)
 
-        if group_name is not None:
-            sample_metric_key = metric + "(sample agg)," + key
-            for grouping in task_to_group[task_name]:
-                if metric_key in results[grouping]:
-                    results[grouping][metric_key].append(task_score)
-                else:
-                    results[grouping][metric_key] = [task_score]
+        # if group_name is not None:
+        #     sample_metric_key = metric + "(sample agg)," + key
+        #     for grouping in task_to_group[task_name]:
+        #         if metric_key in results[grouping]:
+        #             results[grouping][metric_key].append(task_score)
+        #             results[grouping]["size"].append(task_size)
+        #         else:
+        #             results[grouping][metric_key] = [task_score]
+        #             results[grouping]["size"] = [task_size]
 
-                if sample_metric_key in results[grouping]:
-                    results[grouping][sample_metric_key] += items
-                else:
-                    results[grouping][sample_metric_key] = items.copy()
-                    sample_agg_fn[grouping][sample_metric_key] = agg_fn
+        #         if sample_metric_key in results[grouping]:
+        #             results[grouping][sample_metric_key] += items
+        #         else:
+        #             results[grouping][sample_metric_key] = items.copy()
+        #             sample_agg_fn[grouping][sample_metric_key] = agg_fn
 
         results[task_name][metric_key] = task_score
+        results[task_name]["size"] = task_size
 
         # hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
         # so we run them less iterations. still looking for a cleaner way to do this
```
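
This hunk does two things: it comments out the old path that pushed per-task scores (and raw items) into each parent group, and it records the number of scored items for the task (`task_size = len(items)`) under a `"size"` key next to the aggregated score. That size is what the group-level pass below uses as the weight. A hedged sketch of what one task's `results` entry might look like after this step, with a made-up metric and numbers:

```python
# Illustration only: the shape of a single task's results entry, assuming an
# accuracy-style metric whose aggregation is a simple mean over 400 items.
items = [1, 0, 1, 1] * 100                 # stand-in for per-sample scores

def agg_fn(scores):                        # stand-in for task.aggregation()[metric]
    return sum(scores) / len(scores)

task_score = agg_fn(items)                 # 0.75
task_size = len(items)                     # 400

results_entry = {
    "acc,none": task_score,                # metric_key = metric + "," + key
    "size": task_size,                     # weight consumed by the group aggregation
}
print(results_entry)                       # {'acc,none': 0.75, 'size': 400}
```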
```diff
@@ -481,18 +486,36 @@ def evaluate(
             results[task_name][metric + "_stderr" + "," + key] = stderr(items)
 
     if bool(results):
-        for task_or_group in results.keys():
-            for metric in results[task_or_group].keys():
-                if type(results[task_or_group][metric]) == list:
-                    if "(sample agg)" in metric:
-                        results[task_or_group][metric] = sample_agg_fn[
-                            task_or_group
-                        ][metric](results[task_or_group][metric])
-                    else:
-                        results[task_or_group][metric] = np.average(
-                            results[task_or_group][metric]
-                        )
-                    versions[task_or_group] = "N/A"
+        for group, task_list in reversed(task_hierarchy.items()):
+            versions[group] = "N/A"
+            task_score_dict = {}
+            total_size = 0
+            for task in task_list:
+                metrics = results[task]
+                if "size" in metrics:
+                    current_size = metrics.pop("size")
+                else:
+                    current_size = 1
+                for metric in [key for key in metrics.keys()]:
+                    if "_stderr" in metric:
+                        print(metric)
+                    metric_score = results[task][metric]
+                    if metric in results[group]:
+                        results[group][metric] = (results[group][metric]*total_size + metric_score*current_size)/(total_size+current_size)
+                    else:
+                        results[group][metric] = metric_score
+                        # Different formula for agg stderr
+
+                total_size += current_size
 
     for task_name, task in task_dict.items():
         if type(task) == tuple:
```
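
The replaced block is the actual switch to weighted averaging: instead of collecting every task's score into a list and calling `np.average` at the end, each group's score is now a running mean that folds in one task at a time, weighted by that task's `"size"`. With a running average `m` over `N` samples and a new task score `s` over `n` samples, the update `m' = (m*N + s*n) / (N + n)` is algebraically the same as `sum(s_i*n_i) / sum(n_i)` over all member tasks. The sketch below only checks that equivalence with made-up scores and sizes; it does not attempt the pooled standard-error aggregation that the `# Different formula for agg stderr` comment (and the debug `print` on `_stderr` keys) leaves open. Note also that `reversed(task_hierarchy.items())` relies on dict views being reversible, i.e. Python 3.8+.

```python
# Made-up per-task scores and sample counts for a single group.
task_scores = [0.75, 0.60, 0.90]
task_sizes = [400, 100, 50]

# Incremental form, mirroring the commit: fold each task into a running mean.
running_mean, total_size = 0.0, 0
for score, size in zip(task_scores, task_sizes):
    if total_size == 0:
        running_mean = score          # first task seeds the group score
    else:
        running_mean = (running_mean * total_size + score * size) / (total_size + size)
    total_size += size

# Closed form: size-weighted average over all member tasks at once.
closed_form = sum(s * n for s, n in zip(task_scores, task_sizes)) / sum(task_sizes)

assert abs(running_mean - closed_form) < 1e-9
print(round(running_mean, 4))         # 0.7364
```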