"...lm-evaluation-harness.git" did not exist on "e86e7b2767e0bbe2413708197641e28d04f9feaa"
Unverified Commit 04dde62c authored by Lintang Sutawika, committed by GitHub

Update evaluator.py

parent 8680e938
@@ -524,19 +524,37 @@ def evaluate(
                 # or `task_name: []`.
                 # we only want to operate on groups here.
                 continue
-            for metric in [
-                key
-                for key in results[task_list[0]].keys()
-                if "_stderr" not in key and key not in ["alias", "samples"]
-            ]:  # TODO: what if tasks don't all share the same metrics
+            group_metrics = list(
+                dict.fromkeys(
+                    [
+                        key
+                        for task in task_list
+                        for key in results[task].keys()
+                        if "_stderr" not in key and key not in ["alias", "samples"]
+                    ]
+                )
+            )
+            for metric in group_metrics:
+                # TODO: what if tasks don't all share the same metrics
                 stderr = "_stderr,".join(metric.split(","))
                 # gather metrics, sizes, and stderrs from subtasks
                 metrics = [
-                    results[task][metric] for task in task_list
+                    results[task][metric]
+                    for task in task_list
+                    if metric in results[task]
                 ]  # TODO: copy?
-                stderrs = [results[task][stderr] for task in task_list]
-                sizes = [results[task]["samples"] for task in task_list]
+                stderrs = [
+                    results[task][stderr]
+                    for task in task_list
+                    if stderr in results[task]
+                ]
+                sizes = [
+                    results[task]["samples"]
+                    for task in task_list
+                    if metric in results[task]
+                ]
                 # compute group's pooled metric and stderr
                 results[group][
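In short, the new version builds the group's metric list as a union over all subtasks, de-duplicated in insertion order with dict.fromkeys, instead of reading only the first subtask's keys, and it guards each per-task lookup with an `in` check so subtasks that do not report a given metric are skipped. A minimal standalone sketch of that idiom follows; the `results` dict, task names, and metric keys below are invented for illustration and are not actual harness output.

    # Sketch of the dedup-and-filter idiom from the patch, with made-up data.
    results = {
        "subtask_a": {"acc,none": 0.71, "acc_stderr,none": 0.02, "samples": 100},
        "subtask_b": {"acc,none": 0.64, "acc_stderr,none": 0.03, "f1,none": 0.58, "samples": 80},
    }
    task_list = ["subtask_a", "subtask_b"]

    # Union of metric keys across all subtasks, order-preserving and
    # de-duplicated via dict.fromkeys (not just the first subtask's keys).
    group_metrics = list(
        dict.fromkeys(
            key
            for task in task_list
            for key in results[task].keys()
            if "_stderr" not in key and key not in ["alias", "samples"]
        )
    )
    print(group_metrics)  # ['acc,none', 'f1,none']

    for metric in group_metrics:
        # "acc,none" -> "acc_stderr,none"
        stderr = "_stderr,".join(metric.split(","))
        # Only subtasks that actually report this metric contribute values,
        # so metrics and sizes stay aligned even when subtasks differ.
        metrics = [results[task][metric] for task in task_list if metric in results[task]]
        stderrs = [results[task][stderr] for task in task_list if stderr in results[task]]
        sizes = [results[task]["samples"] for task in task_list if metric in results[task]]
        print(metric, metrics, stderrs, sizes)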