modify evaluator metrics to calculate each repeat

acf454b7 · Baber · 28001d29 · acf454b7 · acf454b7
Commit acf454b7 authored May 14, 2025 by Baber
Hide whitespace changes
Inline Side-by-side

Showing with 7 additions and 5 deletions

lm_eval/api/task.py lm_eval/api/task.py +1 -1

lm_eval/evaluator.py lm_eval/evaluator.py +6 -4

No files found.
--- a/lm_eval/api/task.py
+++ b/lm_eval/api/task.py
@@ -884,7 +884,7 @@ class ConfigurableTask(Task):
            eval_logger.debug(
                "No custom filters defined. Using default 'take_first' filter for handling repeats."
            )
-            self._filters = [build_filter_ensemble("none", [["take_first", None]])]
+            # self._filters = [build_filter_ensemble("none", [["take_first", None]])]

        if self.config.use_prompt is not None:
            eval_logger.info(f"loading prompt {self.config.use_prompt}")

--- a/lm_eval/evaluator.py
+++ b/lm_eval/evaluator.py
@@ -613,9 +613,11 @@ def evaluate(
                else:
                    doc_id_true = doc_id
                requests = instances_by_doc_id[doc_id]
-                metrics = task.process_results(
-                    doc, [req.filtered_resps[filter_key] for req in requests]
-                )
+                metrics: list[dict] = [
+                    task.process_results(doc, response)
+                    for req in requests
+                    for response in req.filtered_resps[filter_key]
+                ]
                if log_samples:
                    target = task.doc_to_target(doc)
                    example = {
@@ -628,7 +630,7 @@ def evaluate(
                            req.filtered_resps[filter_key] for req in requests
                        ],
                        "filter": filter_key,
-                        "metrics": list(metrics.keys()),
+                        "metrics": list(set(m.keys() for m in metrics)),
                        "doc_hash": hash_string(
                            json.dumps(
                                requests[0].doc,