gaoqiong / lm-evaluation-harness · Commits

Commit 7f7673ec ("Add comment")
authored May 10, 2021 by Leo Gao
parent 59aff21d

2 changed files with 7 additions and 1 deletion
lm_eval/metrics.py   +6  -0
tests/test_misc.py   +1  -1
lm_eval/metrics.py (view file @ 7f7673ec)

@@ -175,6 +175,12 @@ def _sacreformat(refs, preds):
 def bootstrap_stderr(f, xs, iters=100000):
+    # this gives a biased estimate of the stderr (i.e w/ the mean, it gives something
+    # equivalent to stderr calculated without Bessel's correction in the stddev.
+    # Unfortunately, I haven't been able to figure out what the right correction is
+    # to make the bootstrap unbiased - i considered multiplying by sqrt(n/(n-1)) but
+    # that would be ad-hoc and I can't prove that that would actually be an unbiased estimator)
+    # Thankfully, shouldn't matter because our samples are pretty big usually anyways
     rnd = random.Random()
     rnd.seed(42)
     res = []
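The comment added above concerns the (slight) bias of the bootstrap standard-error estimate. A minimal sketch of the idea, assuming the standard resampling-with-replacement approach (`bootstrap_stderr_sketch` and its internals are illustrative, not the harness's exact implementation): draw many resamples of `xs`, apply the statistic `f` to each, and take the spread of the results. For the mean over a large i.i.d. sample, this should land close to the analytic stderr, which is why the bias is tolerable in practice.

```python
import math
import random

def bootstrap_stderr_sketch(f, xs, iters=2000):
    # Resample xs with replacement, apply the statistic f to each
    # resample, and return the stddev of the replicate statistics.
    # As the commit comment notes, using the population stddev here
    # (no Bessel's correction) makes this a slightly biased estimate.
    rnd = random.Random(42)
    res = [f(rnd.choices(xs, k=len(xs))) for _ in range(iters)]
    m = sum(res) / len(res)
    return math.sqrt(sum((r - m) ** 2 for r in res) / len(res))

def mean(xs):
    return sum(xs) / len(xs)

# For 1000 i.i.d. uniform samples, the bootstrap estimate should be
# close to the analytic stderr of the mean: stddev(xs) / sqrt(n).
rnd = random.Random(0)
arr = [rnd.random() for _ in range(1000)]
boot = bootstrap_stderr_sketch(mean, arr)
sample_var = sum((x - mean(arr)) ** 2 for x in arr) / (len(arr) - 1)
analytic = math.sqrt(sample_var) / math.sqrt(len(arr))
```

With 2000 replicates the Monte Carlo noise of the estimate is well under the `abs=1e-4` tolerance the test below uses for 100000 iterations, which is why the test passes despite the uncorrected bias.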
tests/test_misc.py (view file @ 7f7673ec)

@@ -7,6 +7,6 @@ def test_bootstrapping():
     random.seed(42)
     arr = [random.random() for _ in range(1000)]
     expected = metrics.mean_stderr(arr)
-    bootstrapped = metrics.bootstrap_stderr(metrics.mean, arr, iters=100000)
+    bootstrapped = metrics.bootstrap_stderr(metrics.mean, arr)
     assert bootstrapped == pytest.approx(expected, abs=1e-4)