Fix brier_score to allow multi-GPU inference

386d63ea · lintangsutawika · 4efa0b6d · 386d63ea · 386d63ea
Commit 386d63ea authored Dec 15, 2023 by lintangsutawika
Hide whitespace changes
Inline Side-by-side

Showing 2 changed files with 3 additions and 3 deletions

lm_eval/api/metrics.py (+1 -2)

lm_eval/api/task.py (+2 -1)

No files found.
--- a/lm_eval/api/metrics.py
+++ b/lm_eval/api/metrics.py
@@ -126,8 +126,7 @@ def brier_score(items):  # This is a passthrough function
    for g, p in zip(gold_group.values(), pred_group.values()):
        _p = np.array(p)
        _g = np.array(g)
-        _g_one_hot = np.eye(len(_p[0]))[_g]
-        average += np.mean(np.sum((_p - _g_one_hot) ** 2, axis=1)) * len(_g)
+        average += np.mean(np.sum((_p - _g) ** 2, axis=1)) * len(_g)
        total_size += len(_g)

    return average / total_size

--- a/lm_eval/api/task.py
+++ b/lm_eval/api/task.py
@@ -1116,7 +1116,8 @@ class ConfigurableTask(Task):
                **({"acc_norm": acc_norm} if "acc_norm" in use_metric else {}),
                **({"exact_match": exact_match} if "exact_match" in use_metric else {}),
                **(
-                    {"brier_score": (gold, prob_norm)}
+                    # {"brier_score": (gold, prob_norm)}
+                    {"brier_score": [np.eye(len(prob_norm))[gold], prob_norm]}
                    if "brier_score" in use_metric
                    else {}
                ),