gaoqiong / lm-evaluation-harness / Commits / cf8a257c

Commit cf8a257c authored Dec 18, 2024 by Baber

fix metrics

parent 76e517d1
Changes (2 files)

Showing 2 changed files with 14 additions and 6 deletions (+14 -6)

lm_eval/api/task.py                  +5 -4
lm_eval/tasks/longbench/metrics.py   +9 -2
lm_eval/api/task.py

@@ -322,10 +322,11 @@ class Task(abc.ABC):
         elif self.has_validation_docs():
             return self.validation_docs()
         else:
-            eval_logger.warning(
-                f"[Task: {self.config.task}] has_training_docs and has_validation_docs are False"
-                ", using test_docs as fewshot_docs but this is not recommended."
-            )
+            if self.config.get("num_fewshot", 0) > 0:
+                eval_logger.warning(
+                    f"[Task: {self.config.task}] has_training_docs and has_validation_docs are False"
+                    ", using test_docs as fewshot_docs but this is not recommended."
+                )
             return self.test_docs()

     def _process_doc(self, doc: dict) -> dict:
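The task.py hunk only gates the existing warning: when a task has neither a training nor a validation split, it still falls back to the test split for few-shot sources, but it no longer warns unless few-shot examples were actually requested (num_fewshot > 0). Below is a minimal, hypothetical sketch of that control flow; DemoTask, fewshot_source_docs, and the plain-dict config are illustrative stand-ins rather than the harness's real API.

import logging

logging.basicConfig(level=logging.WARNING)
eval_logger = logging.getLogger("lm-eval")


class DemoTask:
    """Illustrative stand-in for the harness's Task class, not its real API."""

    def __init__(self, config: dict):
        self.config = config  # plain dict standing in for the real config object

    def has_training_docs(self) -> bool:
        return False

    def has_validation_docs(self) -> bool:
        return False

    def test_docs(self) -> list:
        return [{"question": "2 + 2 = ?", "answer": "4"}]

    def fewshot_source_docs(self) -> list:
        # Mirrors the patched control flow from lm_eval/api/task.py.
        if self.has_training_docs():
            return []  # training split preferred when it exists
        elif self.has_validation_docs():
            return []  # then the validation split
        else:
            # After this commit: warn only when few-shot examples are requested.
            if self.config.get("num_fewshot", 0) > 0:
                eval_logger.warning(
                    f"[Task: {self.config.get('task')}] has_training_docs and "
                    "has_validation_docs are False, using test_docs as "
                    "fewshot_docs but this is not recommended."
                )
            return self.test_docs()


DemoTask({"task": "demo", "num_fewshot": 0}).fewshot_source_docs()  # stays quiet
DemoTask({"task": "demo", "num_fewshot": 2}).fewshot_source_docs()  # emits the warning

With num_fewshot left at 0 the fallback happens silently, which avoids noisy warnings for zero-shot runs on test-only tasks.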
lm_eval/tasks/longbench/metrics.py

@@ -134,7 +134,10 @@ def rouge_zh_score(predictions: list[str], references: list[str], **kwargs) -> float:
 def f1_score(predictions: list[str], references: list[str], **kwargs):
-    prediction, ground_truth = predictions[0], references[0]
+    try:
+        prediction, ground_truth = predictions[0], references[0]
+    except:
+        return 0.0
     common = Counter(prediction) & Counter(ground_truth)
     num_same = sum(common.values())
     if num_same == 0:

@@ -152,7 +155,11 @@ def qa_f1_score(predictions: list[str], references: list[str], **kwargs) -> float:
     prediction_tokens = normalized_prediction.split()
     ground_truth_tokens = normalized_ground_truth.split()
-    return f1_score(prediction_tokens, ground_truth_tokens)
+    try:
+        res = f1_score(prediction_tokens, ground_truth_tokens)
+    except:
+        return 0.0
+    return res

 def qa_f1_zh_score(predictions: list[str], references: list[str], **kwargs) -> float:
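Both metrics.py hunks follow the same pattern: let a malformed sample degrade to a score of 0.0 instead of raising and aborting the evaluation. Indexing predictions[0] on an empty list raises IndexError, and the downstream f1_score call can likewise fail on degenerate inputs. The sketch below is a stand-alone, hypothetical copy of the guarded metric, not the harness's actual module; the precision/recall tail after "if num_same == 0:" lies outside the changed hunk and is assumed here to be the standard token-overlap F1.

from collections import Counter


def f1_guarded(predictions: list, references: list, **kwargs) -> float:
    # Stand-alone copy of the guarded metric; illustrative, not the harness module.
    try:
        # Empty predictions/references used to raise IndexError and abort the
        # run; after the patch such a sample simply scores 0.0.
        prediction, ground_truth = predictions[0], references[0]
    except Exception:
        return 0.0
    common = Counter(prediction) & Counter(ground_truth)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    # Assumed continuation (outside the changed hunk): standard overlap F1.
    precision = num_same / len(prediction)
    recall = num_same / len(ground_truth)
    return 2 * precision * recall / (precision + recall)


print(f1_guarded([], []))                            # 0.0 instead of IndexError
print(f1_guarded([list("paris")], [list("paris")]))  # 1.0 on identical token lists

The second hunk applies the same guard one level up in qa_f1_score, so any exception raised inside the scorer is likewise converted into a 0.0 for that sample. One design note: the bare "except:" used in the diff also swallows KeyboardInterrupt and SystemExit; "except Exception:", as in the sketch above, is the narrower and more conventional spelling.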