import math
import random


def pop_stddev(arr):
    """Population standard deviation (divide by N)."""
    mu = sum(arr) / len(arr)
    return math.sqrt(sum((x - mu) ** 2 for x in arr) / len(arr))


def sample_stddev(arr):
    """Sample standard deviation with Bessel's correction (divide by N - 1)."""
    mu = sum(arr) / len(arr)
    return math.sqrt(sum((x - mu) ** 2 for x in arr) / (len(arr) - 1))


def mean_stderr(arr):
    """Standard error of the mean: sample stddev / sqrt(N)."""
    return sample_stddev(arr) / math.sqrt(len(arr))


def acc_all_stderr(items):
    """Standard error of an "all answers correct per question" accuracy.

    `items` is an iterable of (prediction, doc) pairs. A question only counts
    as correct if all of its answers are labeled correctly.
    """
    question_scoring_dict = {}
    preds, docs = zip(*items)

    for doc, pred in zip(docs, preds):
        question_id = doc["idx"]["question"]
        gold_label = doc["label"] == 1
        question_scoring_dict.setdefault(question_id, []).append(gold_label == pred)

    return mean_stderr([int(all(x)) for x in question_scoring_dict.values()])


def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    """Compute the max of `metric_fn(prediction, gt)` over all ground truths."""
    return max(metric_fn(prediction, ground_truth) for ground_truth in ground_truths)


class _bootstrap_internal:
    # A callable class (rather than a closure or lambda) so that it can be
    # pickled and shipped to multiprocessing workers.
    def __init__(self, f, n):
        self.f = f
        self.n = n

    def __call__(self, v):
        i, xs = v
        # Seed per chunk so results are reproducible regardless of scheduling.
        rnd = random.Random(i)
        res = []
        for _ in range(self.n):
            # Resample with replacement and recompute the statistic.
            res.append(self.f(rnd.choices(xs, k=len(xs))))
        return res


def bootstrap_stderr(f, xs, iters):
    """Estimate the standard error of statistic `f` over `xs` by bootstrap.

    Note: this gives a slightly biased estimate of the stderr (with the mean,
    it is equivalent to a stderr calculated without Bessel's correction in the
    stddev). Unfortunately, I haven't been able to figure out the right
    correction to make the bootstrap unbiased: I considered multiplying by
    sqrt(n / (n - 1)), but that would be ad hoc and I can't prove it would
    actually be an unbiased estimator. Thankfully, this shouldn't matter
    because our samples are usually pretty big anyway.
    """
    import multiprocessing as mp

    from tqdm import tqdm

    pool = mp.Pool(mp.cpu_count())
    res = []
    chunk_size = min(1000, iters)

    print("bootstrapping for stddev:", f.__name__)
    # Each worker draws `chunk_size` resamples of xs (with replacement) and
    # returns the statistic computed on each resample.
    for bootstrap in tqdm(
        pool.imap(
            _bootstrap_internal(f, chunk_size),
            [(i, xs) for i in range(iters // chunk_size)],
        ),
        total=iters // chunk_size,
    ):
        res.extend(bootstrap)

    pool.close()
    pool.join()
    return sample_stddev(res)


def yesno(x):
    """Map a truthy value to "yes" and a falsy value to "no"."""
    return "yes" if x else "no"
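
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the original module): compares
# the analytic stderr of the mean against the bootstrap estimate on synthetic
# Bernoulli data. `_demo_mean` is a hypothetical helper defined at module
# level so the statistic is picklable for the multiprocessing pool; the
# __main__ guard is required because bootstrap_stderr spawns worker processes.
# ---------------------------------------------------------------------------
def _demo_mean(xs):
    # Plain arithmetic mean; the statistic whose stderr we estimate.
    return sum(xs) / len(xs)


if __name__ == "__main__":
    rng = random.Random(0)
    # 1000 simulated per-example correctness scores with p(correct) = 0.7.
    scores = [int(rng.random() < 0.7) for _ in range(1000)]

    print("mean accuracy:", _demo_mean(scores))
    print("analytic stderr:", mean_stderr(scores))
    # 2000 bootstrap resamples; should closely match the analytic value.
    print("bootstrap stderr:", bootstrap_stderr(_demo_mean, scores, iters=2000))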