Commit acf454b7 authored by Baber's avatar Baber
Browse files

Modify evaluator metrics to calculate each repeat

parent 28001d29
......@@ -884,7 +884,7 @@ class ConfigurableTask(Task):
eval_logger.debug(
"No custom filters defined. Using default 'take_first' filter for handling repeats."
)
self._filters = [build_filter_ensemble("none", [["take_first", None]])]
# self._filters = [build_filter_ensemble("none", [["take_first", None]])]
if self.config.use_prompt is not None:
eval_logger.info(f"loading prompt {self.config.use_prompt}")
......
......@@ -613,9 +613,11 @@ def evaluate(
else:
doc_id_true = doc_id
requests = instances_by_doc_id[doc_id]
metrics = task.process_results(
doc, [req.filtered_resps[filter_key] for req in requests]
)
metrics: list[dict] = [
task.process_results(doc, response)
for req in requests
for response in req.filtered_resps[filter_key]
]
if log_samples:
target = task.doc_to_target(doc)
example = {
......@@ -628,7 +630,7 @@ def evaluate(
req.filtered_resps[filter_key] for req in requests
],
"filter": filter_key,
"metrics": list(metrics.keys()),
"metrics": list(set(m.keys() for m in metrics)),
"doc_hash": hash_string(
json.dumps(
requests[0].doc,
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment