Commit a6bd7126 authored by Herbie Bradley

Implement ece, remove plotting

parent dadfd4a8
import math
import random
from collections.abc import Iterable
import numpy as np
import sacrebleu
import sklearn.metrics
import random
-from lm_eval.api.registry import register_metric, register_aggregation
+from lm_eval.api.registry import register_aggregation, register_metric
# Register Aggregations First
@@ -56,6 +56,37 @@ def matthews_corrcoef(items):
    return sklearn.metrics.matthews_corrcoef(golds, preds)
+@register_aggregation("ece")
+def ece(items: list) -> float:
+    probs: list[float] = []
+    scores: list[float] = []
+    for i in range(len(items)):
+        # Get only largest probability from each example
+        largest_idx = np.argmax(items[i]["probs"])
+        probs.append(items[i]["probs"][largest_idx])
+        scores.append(items[i]["scores"][largest_idx])
+    sorted_indices = np.argsort(probs)
+    sorted_probs = np.asarray(probs)[sorted_indices]
+    sorted_scores = np.asarray(scores)[sorted_indices]

+    def bin_to_subsets(array: np.ndarray, num_subsets: int = 10) -> np.ndarray:
+        subset_size: int = len(array) // num_subsets
+        remainder: int = len(array) % num_subsets
+        subsets: list[np.ndarray] = []
+        start: int = 0
+        for _ in range(num_subsets):
+            subset_end: int = start + subset_size + (1 if remainder > 0 else 0)
+            subsets.append(array[start:subset_end])
+            start = subset_end
+            remainder -= 1
+        return subsets

+    probs = np.asarray([np.mean(x) for x in bin_to_subsets(sorted_probs, 10)])
+    freqs = np.asarray([np.mean(x) for x in bin_to_subsets(sorted_scores, 10)])
+    return np.sum(np.abs(freqs - probs)) / len(freqs)
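For orientation, the aggregation above expects one entry per document: a dict holding the normalized probabilities over the answer choices ("probs") and a 0/1 indicator of the gold choice ("scores"). The following is an illustrative sketch, not part of the commit; it assumes the aggregation lives at lm_eval.api.metrics.ece and that register_aggregation returns the decorated function unchanged.

import numpy as np

from lm_eval.api.metrics import ece  # assumed import path

# Two toy documents with three choices each, repeated so that none of the
# ten bins ends up empty (an empty bin would make np.mean return NaN).
items = [
    {"probs": np.array([0.7, 0.2, 0.1]), "scores": np.array([1.0, 0.0, 0.0])},
    {"probs": np.array([0.5, 0.3, 0.2]), "scores": np.array([0.0, 1.0, 0.0])},
] * 5

print(ece(items))  # 0.4: mean |accuracy - confidence| over the ten bins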
@register_metric(
    metric="acc",
    higher_is_better=True,
@@ -86,6 +117,26 @@ def acc_mutual_info_fn(items):  # This is a passthrough function
    return items
+@register_metric(
+    metric="ece",
+    higher_is_better=False,
+    output_type="multiple_choice",
+    aggregation="ece",
+)
+def ece_fn(items):  # This is a passthrough function
+    """
+    Expected Calibration Error (ECE).
+
+    For multiple-choice questions, this is the average absolute difference
+    between the fraction of model predictions that are correct and the mean
+    of the model's normalized probabilities for those predictions (after
+    binning). Lower is better.
+
+    Paper: https://arxiv.org/abs/2207.05221
+    """
+    return items
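To illustrate the "lower is better" point, here is another hedged sketch (same assumed import path as above): a toy predictor whose accuracy matches its confidence scores close to zero, while an overconfident predictor that is right only half the time does not.

import numpy as np

from lm_eval.api.metrics import ece  # assumed import path

rng = np.random.default_rng(0)

def toy_items(accuracy_given_confidence, n=1000):
    items = []
    for _ in range(n):
        conf = rng.uniform(0.4, 0.95)                    # top-choice confidence
        probs = np.array([conf] + [(1 - conf) / 3] * 3)  # choice 0 is the prediction
        correct = rng.random() < accuracy_given_confidence(conf)
        scores = np.zeros(4)
        scores[0 if correct else 1] = 1.0
        items.append({"probs": probs, "scores": scores})
    return items

print(ece(toy_items(lambda c: c)))    # well calibrated: ECE near 0
print(ece(toy_items(lambda c: 0.5)))  # overconfident: ECE around 0.2 here

The exact numbers depend on the random seed; only the ordering of the two values matters.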
@register_metric(
    metric="perplexity",
    higher_is_better=False,
......
@@ -651,8 +651,6 @@ class ConfigurableTask(Task):
            if type(test_target) is list:
                self.multiple_target = len(test_target)

-        self.calibrations: list = []

    def download(self, dataset_kwargs=None):
        self.dataset = datasets.load_dataset(
            path=self.DATASET_PATH,
@@ -948,10 +946,7 @@ class ConfigurableTask(Task):
        choices = self.doc_to_choice(doc)
        completion_len = np.array([float(len(i)) for i in choices])

-        if (
-            2 * len(choices) == len(lls)
-            and "acc_mutual_info" in self._metric_fn_list.keys()
-        ):
+        if 2 * len(choices) == len(lls) and "acc_mutual_info" in use_metric:
            # then we are doing mutual info.
            # this stores the "dryrun" / unconditional answer loglikelihoods
            lls_unconditional = lls[1::2]
@@ -968,18 +963,27 @@ class ConfigurableTask(Task):
        gold = self.doc_to_target(doc)
        if type(gold) is str:
            gold = choices.index(gold)

-        # Convert lls from log-probabilities to normalized probabilities
-        norm_probs = np.exp(lls - sp.logsumexp(lls))
-        print(norm_probs)
+        if "ece" in use_metric:
+            # Convert lls from log-probabilities to normalized probabilities
+            norm_probs: np.ndarray = np.exp(lls - sp.logsumexp(lls))
+            calib_scores: np.ndarray = np.zeros(len(choices))
+            if isinstance(gold, list):
+                for g in gold:
+                    calib_scores[g] = 1.0
+            else:
+                calib_scores[gold] = 1.0
+            calibration_probs: dict[str, np.ndarray] = {
+                "probs": norm_probs,
+                "scores": calib_scores,
+            }

        if self.multiple_target:
            acc = 1.0 if pred in gold else 0.0
            acc_norm = 1.0 if pred_norm in gold else 0.0
            exact_match = int(any([is_greedy[i] for i in gold]))
        else:
            acc = 1.0 if pred == gold else 0.0
-            for i, choice in enumerate(choices):
-                calib_score = 1.0 if i == gold else 0.0
-                self.calibrations.append((norm_probs[i], calib_score))
            acc_norm = 1.0 if pred_norm == gold else 0.0
            # TODO: this gets score of 0 on arc_challenge for pythia-70m. need to test that this works properly
            exact_match = int(is_greedy[gold])
@@ -990,6 +994,7 @@ class ConfigurableTask(Task):
            **({"mcc": (gold, pred)} if "mcc" in use_metric else {}),
            **({"acc_norm": acc_norm} if "acc_norm" in use_metric else {}),
            **({"exact_match": exact_match} if "exact_match" in use_metric else {}),
+            **({"ece": calibration_probs} if "ece" in use_metric else {}),
        }

        if "acc_mutual_info" in use_metric:
......
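As a side note, the normalization used in process_results above is simply a softmax over the per-choice log-likelihoods. A standalone numeric check with made-up values, assuming sp refers to scipy.special:

import numpy as np
from scipy.special import logsumexp

lls = np.array([-1.2, -2.3, -0.7])         # per-choice log-likelihoods (made up)
norm_probs = np.exp(lls - logsumexp(lls))  # same expression as in the diff
print(norm_probs, norm_probs.sum())        # approx [0.34 0.11 0.55], sums to 1.0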
@@ -341,32 +341,6 @@ def evaluate(
            for metric, value in metrics.items():
                vals[(task_name, key, metric)].append(value)

-        calibs = sorted(task.calibrations, key=lambda x: x[0])
-
-        def bin_list_into_subsets(input_list, num_subsets=10):
-            subset_size = len(input_list) // num_subsets
-            remainder = len(input_list) % num_subsets
-            subsets = []
-            start = 0
-            for _ in range(num_subsets):
-                subset_end = start + subset_size + (1 if remainder > 0 else 0)
-                subsets.append(input_list[start:subset_end])
-                start = subset_end
-                remainder -= 1
-            return subsets
-
-        subsets = bin_list_into_subsets(calibs, 10)
-        x_coords = [np.mean([x[0] for x in subset]) for subset in subsets]
-        y_coords = [np.mean([x[1] for x in subset]) for subset in subsets]
-        model_name = lm.config._name_or_path.split("/")[1]
-        plt.plot(x_coords, y_coords, label=model_name)
-        plt.plot([0, 1], [0, 1], linestyle="--", color="black")
-        plt.xlabel("Probabilities")
-        plt.ylabel("Frequences")
-        plt.title("Calibration")
-        plt.legend()
-        plt.savefig(f"{model_name}-long.png")

    if lm.world_size > 1:
        # if multigpu, then gather data across all ranks
        # first gather logged samples across all ranks
......
@@ -5,8 +5,7 @@ output_type: multiple_choice
training_split: train
validation_split: validation
test_split: test
-num_fewshot: 5
-doc_to_choice: !function utils_logiqa.doc_to_choice
+doc_to_choice: "{{options}}"
doc_to_text: !function utils_logiqa.doc_to_text
doc_to_target: !function utils_logiqa.doc_to_target
doc_to_decontamination_query: "{{context}}"
......
task: logiqa_calibration
dataset_path: EleutherAI/logiqa
dataset_name: logiqa
output_type: multiple_choice
training_split: train
validation_split: validation
test_split: test
num_fewshot: 5
fewshot_split: train
doc_to_choice: !function utils_logiqa.doc_to_choice
doc_to_text: !function utils_logiqa.doc_to_text
doc_to_target: !function utils_logiqa.doc_to_target
doc_to_decontamination_query: "{{context}}"
should_decontaminate: true
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
  - metric: ece
    aggregation: ece
    higher_is_better: false
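A hypothetical way to run this new task end to end and read off the ece value; it assumes the harness's simple_evaluate entry point and its Hugging Face backend name "hf", with EleutherAI/pythia-70m (the model mentioned in the TODO above) as a placeholder checkpoint; adjust these to your setup.

from lm_eval import evaluator

results = evaluator.simple_evaluate(
    model="hf",                                     # assumed backend name
    model_args="pretrained=EleutherAI/pythia-70m",  # placeholder checkpoint
    tasks=["logiqa_calibration"],
)
print(results["results"]["logiqa_calibration"])     # acc, acc_norm and ece entries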
@@ -24,5 +24,5 @@ def doc_to_target(doc) -> int:
    return choices.index(doc["label"].strip())
-def doc_to_choice(doc):
+def doc_to_choice(doc) -> list:
    return ["(A)", "(B)", "(C)", "(D)"]