Allow sample logging of calibration data

50a8ddfb · Herbie Bradley · a6bd7126 · 50a8ddfb · 50a8ddfb
Commit 50a8ddfb authored Aug 19, 2023 by Herbie Bradley
Hide whitespace changes
Inline Side-by-side

Showing 2 changed files with 3 additions and 4 deletions

lm_eval/api/metrics.py lm_eval/api/metrics.py +0 -1

lm_eval/api/task.py lm_eval/api/task.py +3 -3

No files found.
--- a/lm_eval/api/metrics.py
+++ b/lm_eval/api/metrics.py
@@ -130,7 +130,6 @@ def ece_fn(items):  # This is a passthrough function
    This consists of the average absolute difference between the fraction of
    model predictions which are correct and the mean of the model's normalized
    probability for those predictions (after binning), for multiple choice questions.
-    Lower is better.

    Paper: https://arxiv.org/abs/2207.05221
    """

--- a/lm_eval/api/task.py
+++ b/lm_eval/api/task.py
@@ -966,14 +966,14 @@ class ConfigurableTask(Task):

            if "ece" in use_metric:
                # Convert lls from log-probabilities to normalized probabilities
-                norm_probs: np.ndarray = np.exp(lls - sp.logsumexp(lls))
-                calib_scores: np.ndarray = np.zeros(len(choices))
+                norm_probs: list[float] = np.exp(lls - sp.logsumexp(lls)).tolist()
+                calib_scores: list[float] = [0.0] * len(choices)
                if isinstance(gold, list):
                    for g in gold:
                        calib_scores[g] = 1.0
                else:
                    calib_scores[gold] = 1.0
-                calibration_probs: dict[str, np.ndarray] = {
+                calibration_probs: dict[str, list[float]] = {
                    "probs": norm_probs,
                    "scores": calib_scores,
                }