Commit 82ec4f53 authored by Leo Gao

evaluator: fix bleu/chrf/ter taking forever

parent 198ca732
@@ -93,7 +93,9 @@ def evaluate(lm, task_dict, provide_description, num_fewshot, limit, bootstrap_i
         task = task_dict[task_name]
         results[task_name][metric] = task.aggregation()[metric](items)
-        stderr = lm_eval.metrics.stderr_for_metric(task.aggregation()[metric], bootstrap_iters=bootstrap_iters)
+        # hotfix: bleu, chrf, ter seem to be really expensive to bootstrap,
+        # so we run them for fewer iterations. still looking for a cleaner way to do this
+        stderr = lm_eval.metrics.stderr_for_metric(task.aggregation()[metric], bootstrap_iters=1000 if metric in ["bleu", "chrf", "ter"] else bootstrap_iters)
         if stderr is not None:
             results[task_name][metric + "_stderr"] = stderr(items)
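For context on why this cap helps: bootstrap stderr estimation recomputes the aggregation metric on a resampled corpus at every iteration, so total cost scales linearly with bootstrap_iters times the cost of one metric evaluation. Corpus-level metrics like BLEU, chrF, and TER are far more expensive per call than simple per-example means, which is what made the uncapped run take forever. Below is a minimal sketch of the idea; the helper name bootstrap_stderr, its signature, and the seed are illustrative assumptions, not the actual lm_eval.metrics.stderr_for_metric implementation.

import random
import statistics

def bootstrap_stderr(metric_fn, items, iters=1000, seed=1234):
    # Illustrative sketch (assumption): estimate the standard error of a
    # corpus-level metric by resampling the corpus with replacement and
    # recomputing the full metric each time. Every iteration pays the full
    # cost of metric_fn over len(items) examples, so runtime grows linearly
    # with `iters` -- the reason expensive metrics like bleu/chrf/ter get
    # capped at 1000 iterations in the diff above.
    rng = random.Random(seed)
    estimates = [metric_fn(rng.choices(items, k=len(items))) for _ in range(iters)]
    return statistics.stdev(estimates)

Capping iterations for these metrics trades some precision in the stderr estimate for a bounded runtime; cheap per-example metrics keep the caller-supplied bootstrap_iters.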