Commit 00859825 authored by lintangsutawika

added stderr reprocessing for groups

parent 1dc8f96f
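For context on the change below: the group-level stderr "reprocessing" is a running two-sample combination. Each task contributes a score, a stderr, and a sample count, and the group statistics are folded in one task at a time using the combined-sample-variance identity quoted in the diff. The sketch below is illustrative only (it is not part of the commit, and the names GroupStat and pool are made up); it works with variances for clarity, whereas the diff applies the same update directly to the reported stderr values.

# Minimal sketch: fold per-task (mean, variance, size) into running group stats.
from dataclasses import dataclass

@dataclass
class GroupStat:
    mean: float = 0.0
    var: float = 0.0   # combined sample variance
    n: int = 0         # samples pooled so far

def pool(group: GroupStat, task_mean: float, task_var: float, m: int) -> GroupStat:
    """Combine the running group statistics with one task's statistics."""
    if group.n == 0:
        return GroupStat(task_mean, task_var, m)
    n = group.n
    # weighted mean of the two samples
    mean = (n * group.mean + m * task_mean) / (n + m)
    # combined sample variance:
    #   s_z^2 = ((n-1)s_x^2 + (m-1)s_y^2)/(n+m-1)
    #           + n*m*(xbar - ybar)^2 / ((n+m)*(n+m-1))
    var = ((n - 1) * group.var + (m - 1) * task_var) / (n + m - 1) \
        + n * m * (group.mean - task_mean) ** 2 / ((n + m) * (n + m - 1))
    return GroupStat(mean, var, n + m)

# illustrative numbers only
g = GroupStat()
for task_mean, task_var, size in [(0.62, 0.04, 400), (0.55, 0.05, 250)]:
    g = pool(g, task_mean, task_var, size)
print(g.mean, g.var ** 0.5, g.n)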
@@ -219,7 +219,6 @@ def evaluate(
padding_requests = collections.defaultdict(int)
# store the hierarchy to do proper ordering
task_hierarchy = collections.defaultdict(list)
group_hierarchy = collections.defaultdict(list)
# store the ordering of tasks and groups
task_order = collections.defaultdict(int)
# store the aggregation for aggregating across tasks in the same group
@@ -450,27 +449,8 @@ def evaluate(
group_name = None
agg_fn = task.aggregation()[metric]
task_score = agg_fn(items)
task_size = len(items)
# if group_name is not None:
# sample_metric_key = metric + "(sample agg)," + key
# for grouping in task_to_group[task_name]:
# if metric_key in results[grouping]:
# results[grouping][metric_key].append(task_score)
# results[grouping]["size"].append(task_size)
# else:
# results[grouping][metric_key] = [task_score]
# results[grouping]["size"] = [task_size]
# if sample_metric_key in results[grouping]:
# results[grouping][sample_metric_key] += items
# else:
# results[grouping][sample_metric_key] = items.copy()
# sample_agg_fn[grouping][sample_metric_key] = agg_fn
results[task_name][metric_key] = task_score
results[task_name]["size"] = task_size
results[task_name][metric_key] = agg_fn(items)
results[task_name]["samples"] = len(items)
# hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
# so we run them less iterations. still looking for a cleaner way to do this
@@ -494,28 +474,29 @@ def evaluate(
for task in task_list:
metrics = results[task]
if "size" in metrics:
current_size = metrics.pop("size")
else:
current_size = 1
for metric in [key for key in metrics.keys()]:
current_size = metrics.pop("samples")
# if "size" in metrics:
# current_size = metrics.pop("size")
# else:
# current_size = 1
if "_stderr" in metric:
print(metric)
for metric in [key for key in metrics.keys() if "_stderr" not in key]:
stderr = "_stderr,".join(metric.split(","))
stderr_score = results[task][stderr]
metric_score = results[task][metric]
if metric in results[group]:
results[group][metric] = (results[group][metric]*total_size + metric_score*current_size)/(total_size+current_size)
# $$s_z^2 = \frac{(n-1) s_x^2 + (m-1) s_y^2}{n+m-1} + \frac{nm(\bar x - \bar y)^2}{(n+m)(n+m-1)}.$$
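# mapping to the identity above: n = total_size (group samples pooled so far),
# m = current_size (this task), s_x/s_y = the group/task stderr values,
# xbar/ybar = the group/task metric scores; the code plugs the reported
# stderr values into this combined-variance identity, and the squared-difference
# term below uses the group metric that was just updated on the previous line.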
results[group][stderr] = ((total_size-1)*results[group][stderr]+(current_size-1)*stderr_score)/(total_size + current_size - 1) \
+ total_size*current_size/((total_size+current_size)*(total_size+current_size-1))*(results[group][metric] - metric_score)**2
else:
results[group][metric] = metric_score
# Different formula for agg stderr
results[group][stderr] = stderr_score
total_size += current_size
results[group]["samples"] = total_size
for task_name, task in task_dict.items():
if type(task) == tuple:
......
@@ -11,6 +11,3 @@ metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true