Commit 2d843472 authored by jon-tow's avatar jon-tow
Browse files

feat: temp remove bootstrap calc

parent 83dbfbf6
......@@ -284,15 +284,12 @@ def evaluate(
# hotfix: bleu, chrf, and ter seem to be really expensive to bootstrap,
# so we run them for fewer iterations. Still looking for a cleaner way to do this.
stderr = lm_eval.metrics.stderr_for_metric(
metric=task.aggregation()[real_metric],
bootstrap_iters=min(bootstrap_iters, 1000)
if metric in ["bleu", "chrf", "ter"]
else bootstrap_iters,
)
if stderr is not None:
results[task_name][metric + "_stderr"] = stderr(items)
# stderr = lm_eval.metrics.stderr_for_metric(
# metric=task.aggregation()[real_metric],
# bootstrap_iters=min(bootstrap_iters, 1000)
# if metric in ["bleu", "chrf", "ter"]
# else bootstrap_iters,
# )
return {"results": dict(results), "versions": dict(versions)}
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment