Commit a6bd7126 authored by Herbie Bradley

Implement ece, remove plotting

parent dadfd4a8
import math
import random
from collections.abc import Iterable
import numpy as np
import sacrebleu
import sklearn.metrics
import random
-from lm_eval.api.registry import register_metric, register_aggregation
+from lm_eval.api.registry import register_aggregation, register_metric
# Register Aggregations First
@@ -56,6 +56,37 @@ def matthews_corrcoef(items):
    return sklearn.metrics.matthews_corrcoef(golds, preds)
+@register_aggregation("ece")
+def ece(items: list) -> float:
+    probs: list[float] = []
+    scores: list[float] = []
+    for i in range(len(items)):
+        # Get only largest probability from each example
+        largest_idx = np.argmax(items[i]["probs"])
+        probs.append(items[i]["probs"][largest_idx])
+        scores.append(items[i]["scores"][largest_idx])
+    sorted_indices = np.argsort(probs)
+    sorted_probs = np.asarray(probs)[sorted_indices]
+    sorted_scores = np.asarray(scores)[sorted_indices]

+    def bin_to_subsets(array: np.ndarray, num_subsets: int = 10) -> np.ndarray:
+        subset_size: int = len(array) // num_subsets
+        remainder: int = len(array) % num_subsets
+        subsets: list[np.ndarray] = []
+        start: int = 0
+        for _ in range(num_subsets):
+            subset_end: int = start + subset_size + (1 if remainder > 0 else 0)
+            subsets.append(array[start:subset_end])
+            start = subset_end
+            remainder -= 1
+        return subsets

+    probs = np.asarray([np.mean(x) for x in bin_to_subsets(sorted_probs, 10)])
+    freqs = np.asarray([np.mean(x) for x in bin_to_subsets(sorted_scores, 10)])
+    return np.sum(np.abs(freqs - probs)) / len(freqs)
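For orientation, the aggregation above expects one entry per document: a dict holding the normalized probabilities over the answer choices ("probs") and a 0/1 indicator of the gold choice ("scores"). The following is an illustrative sketch, not part of the commit; it assumes the aggregation lives at lm_eval.api.metrics.ece and that register_aggregation returns the decorated function unchanged.

import numpy as np

from lm_eval.api.metrics import ece  # assumed import path

# Two toy documents with three choices each, repeated so that none of the
# ten bins ends up empty (an empty bin would make np.mean return NaN).
items = [
    {"probs": np.array([0.7, 0.2, 0.1]), "scores": np.array([1.0, 0.0, 0.0])},
    {"probs": np.array([0.5, 0.3, 0.2]), "scores": np.array([0.0, 1.0, 0.0])},
] * 5

print(ece(items))  # 0.4: mean |accuracy - confidence| over the ten bins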
@register_metric(
    metric="acc",
    higher_is_better=True,
@@ -86,6 +117,26 @@ def acc_mutual_info_fn(items):  # This is a passthrough function
    return items
+@register_metric(
+    metric="ece",
+    higher_is_better=False,
+    output_type="multiple_choice",
+    aggregation="ece",
+)
+def ece_fn(items):  # This is a passthrough function
+    """
+    Expected Calibration Error (ECE).
+
+    For multiple-choice questions, this is the average absolute difference
+    between the fraction of model predictions that are correct and the mean
+    of the model's normalized probabilities for those predictions (after
+    binning). Lower is better.
+
+    Paper: https://arxiv.org/abs/2207.05221
+    """
+    return items
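To illustrate the "lower is better" point, here is another hedged sketch (same assumed import path as above): a toy predictor whose accuracy matches its confidence scores close to zero, while an overconfident predictor that is right only half the time does not.

import numpy as np

from lm_eval.api.metrics import ece  # assumed import path

rng = np.random.default_rng(0)

def toy_items(accuracy_given_confidence, n=1000):
    items = []
    for _ in range(n):
        conf = rng.uniform(0.4, 0.95)                    # top-choice confidence
        probs = np.array([conf] + [(1 - conf) / 3] * 3)  # choice 0 is the prediction
        correct = rng.random() < accuracy_given_confidence(conf)
        scores = np.zeros(4)
        scores[0 if correct else 1] = 1.0
        items.append({"probs": probs, "scores": scores})
    return items

print(ece(toy_items(lambda c: c)))    # well calibrated: ECE near 0
print(ece(toy_items(lambda c: 0.5)))  # overconfident: ECE around 0.2 here

The exact numbers depend on the random seed; only the ordering of the two values matters.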
@register_metric(
    metric="perplexity",
    higher_is_better=False,
......
@@ -651,8 +651,6 @@ class ConfigurableTask(Task):
            if type(test_target) is list:
                self.multiple_target = len(test_target)

-        self.calibrations: list = []

    def download(self, dataset_kwargs=None):
        self.dataset = datasets.load_dataset(
            path=self.DATASET_PATH,
@@ -948,10 +946,7 @@ class ConfigurableTask(Task):
        choices = self.doc_to_choice(doc)
        completion_len = np.array([float(len(i)) for i in choices])

-        if (
-            2 * len(choices) == len(lls)
-            and "acc_mutual_info" in self._metric_fn_list.keys()
-        ):
+        if 2 * len(choices) == len(lls) and "acc_mutual_info" in use_metric:
            # then we are doing mutual info.
            # this stores the "dryrun" / unconditional answer loglikelihoods
            lls_unconditional = lls[1::2]
@@ -968,18 +963,27 @@ class ConfigurableTask(Task):
        gold = self.doc_to_target(doc)
        if type(gold) is str:
            gold = choices.index(gold)

-        # Convert lls from log-probabilities to normalized probabilities
-        norm_probs = np.exp(lls - sp.logsumexp(lls))
-        print(norm_probs)
+        if "ece" in use_metric:
+            # Convert lls from log-probabilities to normalized probabilities
+            norm_probs: np.ndarray = np.exp(lls - sp.logsumexp(lls))
+            calib_scores: np.ndarray = np.zeros(len(choices))
+            if isinstance(gold, list):
+                for g in gold:
+                    calib_scores[g] = 1.0
+            else:
+                calib_scores[gold] = 1.0
+            calibration_probs: dict[str, np.ndarray] = {
+                "probs": norm_probs,
+                "scores": calib_scores,
+            }

        if self.multiple_target:
            acc = 1.0 if pred in gold else 0.0
            acc_norm = 1.0 if pred_norm in gold else 0.0
            exact_match = int(any([is_greedy[i] for i in gold]))
        else:
            acc = 1.0 if pred == gold else 0.0
-            for i, choice in enumerate(choices):
-                calib_score = 1.0 if i == gold else 0.0
-                self.calibrations.append((norm_probs[i], calib_score))
            acc_norm = 1.0 if pred_norm == gold else 0.0
            # TODO: this gets score of 0 on arc_challenge for pythia-70m. need to test that this works properly
            exact_match = int(is_greedy[gold])
@@ -990,6 +994,7 @@ class ConfigurableTask(Task):
            **({"mcc": (gold, pred)} if "mcc" in use_metric else {}),
            **({"acc_norm": acc_norm} if "acc_norm" in use_metric else {}),
            **({"exact_match": exact_match} if "exact_match" in use_metric else {}),
+            **({"ece": calibration_probs} if "ece" in use_metric else {}),
        }

        if "acc_mutual_info" in use_metric:
......
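As a side note, the normalization used in process_results above is simply a softmax over the per-choice log-likelihoods. A standalone numeric check with made-up values, assuming sp refers to scipy.special:

import numpy as np
from scipy.special import logsumexp

lls = np.array([-1.2, -2.3, -0.7])         # per-choice log-likelihoods (made up)
norm_probs = np.exp(lls - logsumexp(lls))  # same expression as in the diff
print(norm_probs, norm_probs.sum())        # approx [0.34 0.11 0.55], sums to 1.0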
@@ -341,32 +341,6 @@ def evaluate(
            for metric, value in metrics.items():
                vals[(task_name, key, metric)].append(value)

-        calibs = sorted(task.calibrations, key=lambda x: x[0])
-
-        def bin_list_into_subsets(input_list, num_subsets=10):
-            subset_size = len(input_list) // num_subsets
-            remainder = len(input_list) % num_subsets
-            subsets = []
-            start = 0
-            for _ in range(num_subsets):
-                subset_end = start + subset_size + (1 if remainder > 0 else 0)
-                subsets.append(input_list[start:subset_end])
-                start = subset_end
-                remainder -= 1
-            return subsets
-
-        subsets = bin_list_into_subsets(calibs, 10)
-        x_coords = [np.mean([x[0] for x in subset]) for subset in subsets]
-        y_coords = [np.mean([x[1] for x in subset]) for subset in subsets]
-        model_name = lm.config._name_or_path.split("/")[1]
-        plt.plot(x_coords, y_coords, label=model_name)
-        plt.plot([0, 1], [0, 1], linestyle="--", color="black")
-        plt.xlabel("Probabilities")
-        plt.ylabel("Frequences")
-        plt.title("Calibration")
-        plt.legend()
-        plt.savefig(f"{model_name}-long.png")

    if lm.world_size > 1:
        # if multigpu, then gather data across all ranks
        # first gather logged samples across all ranks
......
@@ -5,8 +5,7 @@ output_type: multiple_choice
training_split: train
validation_split: validation
test_split: test
-num_fewshot: 5
-doc_to_choice: !function utils_logiqa.doc_to_choice
+doc_to_choice: "{{options}}"
doc_to_text: !function utils_logiqa.doc_to_text
doc_to_target: !function utils_logiqa.doc_to_target
doc_to_decontamination_query: "{{context}}"
......
task: logiqa_calibration
dataset_path: EleutherAI/logiqa
dataset_name: logiqa
output_type: multiple_choice
training_split: train
validation_split: validation
test_split: test
num_fewshot: 5
fewshot_split: train
doc_to_choice: !function utils_logiqa.doc_to_choice
doc_to_text: !function utils_logiqa.doc_to_text
doc_to_target: !function utils_logiqa.doc_to_target
doc_to_decontamination_query: "{{context}}"
should_decontaminate: true
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
  - metric: ece
    aggregation: ece
    higher_is_better: false
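A hypothetical way to run this new task end to end and read off the ece value; it assumes the harness's simple_evaluate entry point and its Hugging Face backend name "hf", with EleutherAI/pythia-70m (the model mentioned in the TODO above) as a placeholder checkpoint; adjust these to your setup.

from lm_eval import evaluator

results = evaluator.simple_evaluate(
    model="hf",                                     # assumed backend name
    model_args="pretrained=EleutherAI/pythia-70m",  # placeholder checkpoint
    tasks=["logiqa_calibration"],
)
print(results["results"]["logiqa_calibration"])     # acc, acc_norm and ece entries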
@@ -24,5 +24,5 @@ def doc_to_target(doc) -> int:
    return choices.index(doc["label"].strip())
-def doc_to_choice(doc):
+def doc_to_choice(doc) -> list:
    return ["(A)", "(B)", "(C)", "(D)"]