Commit dd3b4f7a authored by Leo Gao

evaluator: take min bootstrap steps for bleu/chrf/ter

parent d67c77be
@@ -95,7 +95,7 @@ def evaluate(lm, task_dict, provide_description, num_fewshot, limit, bootstrap_iters
         # hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
         # so we run them less iterations. still looking for a cleaner way to do this
-        stderr = lm_eval.metrics.stderr_for_metric(task.aggregation()[metric], bootstrap_iters=1000 if metric in ["bleu", "chrf", "ter"] else bootstrap_iters)
+        stderr = lm_eval.metrics.stderr_for_metric(task.aggregation()[metric], bootstrap_iters=min(bootstrap_iters, 1000) if metric in ["bleu", "chrf", "ter"] else bootstrap_iters)
         if stderr is not None:
             results[task_name][metric + "_stderr"] = stderr(items)
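For context, the stderr here comes from bootstrap resampling: the metric is recomputed over bootstrap_iters resampled copies of the per-example results, so cost grows linearly with the iteration count, and corpus-level metrics like BLEU/chrF/TER pay their full recomputation cost on every iteration. The sketch below illustrates that idea only; it is not the actual lm_eval.metrics.stderr_for_metric implementation, and bootstrap_stderr is a hypothetical helper.

    import random
    import statistics

    def bootstrap_stderr(metric_fn, items, bootstrap_iters):
        # Hypothetical sketch: recompute the metric on `bootstrap_iters`
        # resamples (with replacement) of the per-example items; the spread
        # of those estimates approximates the standard error. Each iteration
        # calls metric_fn on a full resample, so expensive metrics such as
        # corpus BLEU pay their full cost every iteration.
        estimates = []
        for _ in range(bootstrap_iters):
            resample = random.choices(items, k=len(items))
            estimates.append(metric_fn(resample))
        return statistics.stdev(estimates)

    # Usage mirroring the hotfix above (names assumed for illustration):
    # cap iterations for metrics that are costly to recompute, otherwise
    # keep the caller-supplied value.
    # expensive = {"bleu", "chrf", "ter"}
    # iters = min(bootstrap_iters, 1000) if metric in expensive else bootstrap_iters
    # stderr = bootstrap_stderr(task.aggregation()[metric], items, iters)

The change itself only swaps the hard-coded 1000 for min(bootstrap_iters, 1000), so a caller who asks for fewer than 1000 iterations is no longer silently bumped up for these three metrics.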