Commit 82ec4f53 authored by Leo Gao

evaluator: fix bleu/chrf/ter taking forever

parent 198ca732
@@ -93,7 +93,9 @@ def evaluate(lm, task_dict, provide_description, num_fewshot, limit, bootstrap_i
         task = task_dict[task_name]
         results[task_name][metric] = task.aggregation()[metric](items)
-        stderr = lm_eval.metrics.stderr_for_metric(task.aggregation()[metric], bootstrap_iters=bootstrap_iters)
+        # hotfix: bleu, chrf, ter seem to be really expensive to bootstrap,
+        # so we run them for fewer iterations. still looking for a cleaner way to do this
+        stderr = lm_eval.metrics.stderr_for_metric(task.aggregation()[metric], bootstrap_iters=1000 if metric in ["bleu", "chrf", "ter"] else bootstrap_iters)
         if stderr is not None:
             results[task_name][metric + "_stderr"] = stderr(items)
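For context on why this cap helps: bootstrap stderr estimation recomputes the aggregation metric on a resampled corpus at every iteration, so total cost scales linearly with bootstrap_iters times the cost of one metric evaluation. Corpus-level metrics like BLEU, chrF, and TER are far more expensive per call than simple per-example means, which is what made the uncapped run take forever. Below is a minimal sketch of the idea; the helper name bootstrap_stderr, its signature, and the seed are illustrative assumptions, not the actual lm_eval.metrics.stderr_for_metric implementation.

import random
import statistics

def bootstrap_stderr(metric_fn, items, iters=1000, seed=1234):
    # Illustrative sketch (assumption): estimate the standard error of a
    # corpus-level metric by resampling the corpus with replacement and
    # recomputing the full metric each time. Every iteration pays the full
    # cost of metric_fn over len(items) examples, so runtime grows linearly
    # with `iters` -- the reason expensive metrics like bleu/chrf/ter get
    # capped at 1000 iterations in the diff above.
    rng = random.Random(seed)
    estimates = [metric_fn(rng.choices(items, k=len(items))) for _ in range(iters)]
    return statistics.stdev(estimates)

Capping iterations for these metrics trades some precision in the stderr estimate for a bounded runtime; cheap per-example metrics keep the caller-supplied bootstrap_iters.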