]=None# by default, not used in the code. allows for users to pass arbitrary info to tasks
]=None# by default, not used in the code. allows for users to pass arbitrary info to tasks
def __post_init__(self) -> None:
    """Validate and migrate deprecated config keys after dataclass init.

    Handles the legacy ``group`` key: warns that it is deprecated,
    migrates its value into ``tag`` when no ``tag`` was given, and
    raises if both ``group`` and ``tag`` are set.

    Raises:
        ValueError: if both ``group`` and ``tag`` are present in the config.
    """
    if self.group is not None:
        # `group` on a TaskConfig is deprecated (slated for v0.4.4);
        # aggregate groups now live in a separate config file.
        eval_logger.warning(
            "A task YAML file was found to contain a `group` key. Groups which provide aggregate scores over several subtasks now require a separate config file--if not aggregating, you may want to use the `tag` config option instead within your config. Setting `group` within a TaskConfig will be deprecated in v0.4.4. Please see https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/task_guide.md for more information."
        )

        if self.tag is None:
            # Backward compatibility: treat the legacy `group` value as a tag.
            self.tag = self.group
        else:
            # Ambiguous config: the user supplied both the old and new key.
            raise ValueError(
                "Got both a `group` and `tag` entry within a TaskConfig. Please use one or the other--`group` values will be deprecated in v0.4.4."
            )
f"Currently, only 'mean' is supported for automatically aggregating scores across groups' subtasks. Got '{metric_config['aggregation']}' for group '{group_or_task}'"
)
results[group_or_task][metric]=aggregate_fn(
results[group_or_task][metric]=aggregate_fn(
metrics,
metrics,
sizes,
sizes,
metric_config["weight_by_size"],
metric_config["weight_by_size"],
)
)
# TODO: calculate grouped metric using aggregation fn
# TODO: calculate groups' metrics using arbitrary agg fns
if"N/A"instderrs:
if"N/A"instderrs:
results[group_or_task][stderr]="N/A"
results[group_or_task][stderr]="N/A"
else:
else:
# TODO: put in a warning, if we are using non-micro avg mean or another aggregation fn
results[group_or_task][
results[group_or_task][
stderr
stderr
]=lm_eval.api.metrics.pooled_sample_stderr(
]=lm_eval.api.metrics.pooled_sample_stderr(
stderrs,sizes
stderrs,sizes
)
)
# TODO: allow GroupConfigs to choose which variance formula is used, for back-compatibility
# To use the old (likely incorrect) variance formula, comment out the above and uncomment this line: