merged cont-metrics here

6998762a · lintangsutawika · 2184b8de
Commit 6998762a authored Nov 09, 2023 by lintangsutawika
12 changed files
--- a/lm_eval/api/metrics.py
+++ b/lm_eval/api/metrics.py
@@ -109,9 +109,9 @@ def ter(items):
 @register_aggregation("brier_score")
 def brier_score(items):  # This is a passthrough function
    gold, predictions = list(zip(*items))
-    gold = list(gold)
+    gold = np.array(gold)
-    gold_one_hot = np.eye(np.max(gold) + 1)[gold]
+    predictions = np.array(predictions)
-    predictions = list(zip(*items))[1]
+    gold_one_hot = np.eye(len(predictions[0]))[gold]
    return np.mean(np.sum((predictions - gold_one_hot) ** 2, axis=1))

--- a/lm_eval/evaluator.py
+++ b/lm_eval/evaluator.py
@@ -468,6 +468,8 @@ def evaluate(
                if stderr is not None:
                    results[task_name][metric + "_stderr" + "," + key] = stderr(items)
+                else:
+                    results[task_name][metric + "_stderr" + "," + key] = 0
        if bool(results):

--- a/lm_eval/tasks/mmlu/alternative_worlds/full_continuation/style_01/_template_yaml
+++ b/lm_eval/tasks/mmlu/alternative_worlds/full_continuation/style_01/_template_yaml
@@ -10,6 +10,4 @@ metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
-  - metric: acc_norm
+  - metric: brier_score
-    aggregation: mean
-    higher_is_better: true
--- a/lm_eval/tasks/mmlu/alternative_worlds/full_continuation/style_02/_template_yaml
+++ b/lm_eval/tasks/mmlu/alternative_worlds/full_continuation/style_02/_template_yaml
@@ -10,6 +10,4 @@ metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
-  - metric: acc_norm
+  - metric: brier_score
-    aggregation: mean
-    higher_is_better: true
--- a/lm_eval/tasks/mmlu/alternative_worlds/full_continuation/style_03/_template_yaml
+++ b/lm_eval/tasks/mmlu/alternative_worlds/full_continuation/style_03/_template_yaml
@@ -10,6 +10,4 @@ metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
-  - metric: acc_norm
+  - metric: brier_score
-    aggregation: mean
-    higher_is_better: true
--- a/lm_eval/tasks/mmlu/alternative_worlds/full_continuation/style_04/_template_yaml
+++ b/lm_eval/tasks/mmlu/alternative_worlds/full_continuation/style_04/_template_yaml
@@ -10,6 +10,4 @@ metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
-  - metric: acc_norm
+  - metric: brier_score
-    aggregation: mean
-    higher_is_better: true
--- a/lm_eval/tasks/mmlu/alternative_worlds/full_continuation/style_05/_template_yaml
+++ b/lm_eval/tasks/mmlu/alternative_worlds/full_continuation/style_05/_template_yaml
@@ -10,6 +10,4 @@ metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
-  - metric: acc_norm
+  - metric: brier_score
-    aggregation: mean
-    higher_is_better: true
--- a/lm_eval/tasks/mmlu/alternative_worlds/letters_only/style_01/_template_yaml
+++ b/lm_eval/tasks/mmlu/alternative_worlds/letters_only/style_01/_template_yaml
@@ -10,6 +10,4 @@ metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
-  - metric: acc_norm
+  - metric: brier_score
-    aggregation: mean
-    higher_is_better: true
--- a/lm_eval/tasks/mmlu/alternative_worlds/letters_only/style_02/_template_yaml
+++ b/lm_eval/tasks/mmlu/alternative_worlds/letters_only/style_02/_template_yaml
@@ -10,6 +10,4 @@ metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
-  - metric: acc_norm
+  - metric: brier_score
-    aggregation: mean
-    higher_is_better: true
--- a/lm_eval/tasks/mmlu/alternative_worlds/letters_only/style_03/_template_yaml
+++ b/lm_eval/tasks/mmlu/alternative_worlds/letters_only/style_03/_template_yaml
@@ -10,6 +10,4 @@ metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
-  - metric: acc_norm
+  - metric: brier_score
-    aggregation: mean
-    higher_is_better: true
--- a/lm_eval/tasks/mmlu/alternative_worlds/letters_only/style_04/_template_yaml
+++ b/lm_eval/tasks/mmlu/alternative_worlds/letters_only/style_04/_template_yaml
@@ -10,6 +10,4 @@ metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
-  - metric: acc_norm
+  - metric: brier_score
-    aggregation: mean
-    higher_is_better: true
--- a/lm_eval/tasks/mmlu/alternative_worlds/letters_only/style_05/_template_yaml
+++ b/lm_eval/tasks/mmlu/alternative_worlds/letters_only/style_05/_template_yaml
@@ -10,6 +10,4 @@ metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
-  - metric: acc_norm
+  - metric: brier_score
-    aggregation: mean
-    higher_is_better: true