fix processing regression; recursively parse lists from template

e66aa10c · Baber · 3d5fa4c7 · e66aa10c · e66aa10c · e66aa10c
Commit e66aa10c authored Aug 02, 2025 by Baber
Showing with 22 additions and 8 deletions

lm_eval/api/task.py lm_eval/api/task.py +6 -5

lm_eval/tasks/super_glue/copa/default.yaml lm_eval/tasks/super_glue/copa/default.yaml +1 -1

lm_eval/utils.py lm_eval/utils.py +15 -2

No files found.
--- a/lm_eval/api/task.py
+++ b/lm_eval/api/task.py
@@ -18,6 +18,7 @@ from typing import (
    Optional,
    Tuple,
    Union,
+    cast,
 )
 import datasets
@@ -1382,9 +1383,9 @@ class ConfigurableTask(Task):
            if doc_to_choice in self.features:
                return doc[doc_to_choice]
            else:
-                return utils.apply_template(doc_to_choice, doc)
+                return cast(list, utils.apply_template(doc_to_choice, doc))
        elif isinstance(doc_to_choice, list):
-            return doc_to_choice
+            return utils.apply_template(doc_to_choice, doc)
        elif isinstance(doc_to_choice, dict):
            return list(doc_to_choice.values())
        elif callable(doc_to_choice):
@@ -1606,8 +1607,8 @@ class ConfigurableTask(Task):
            pred = np.argmax(lls)
            pred_norm = np.argmax(lls / completion_len)
-            gold = (
+            gold = backup = (
-                self.doc_to_text(doc)
+                self.doc_to_target(doc)
                if not self.multiple_inputs
                else self.doc_to_text(doc)
            )
@@ -1625,7 +1626,7 @@ class ConfigurableTask(Task):
            if gold_index_error:
                eval_logger.warning(
-                    f"Label index was not in within range of available choices,"
+                    f"Label [{backup}] index was not in within range of available choices {choices},"
                    f"Sample:\n\n{doc}\n\n"
                )

--- a/lm_eval/tasks/super_glue/copa/default.yaml
+++ b/lm_eval/tasks/super_glue/copa/default.yaml
@@ -8,7 +8,7 @@ training_split: train
 validation_split: validation
 doc_to_text: !function utils.doc_to_text
 doc_to_target: label
-doc_to_choice:  "{{ [choice1, choice2] }}"
+doc_to_choice: ["{{ choice1 }}",  "{{ choice2 }}"]
 metric_list:
  - metric: acc
 metadata:

--- a/lm_eval/utils.py
+++ b/lm_eval/utils.py
@@ -11,7 +11,7 @@ import re
 from dataclasses import asdict, is_dataclass
 from itertools import islice
 from pathlib import Path
-from typing import Any, Callable, Generator, List, Optional, Tuple
+from typing import Any, Callable, Generator, List, Optional, Tuple, Union, overload
 import numpy as np
 import yaml
@@ -545,7 +545,20 @@ env = Environment(
 env.filters["regex_replace"] = regex_replace
-def apply_template(template: str, doc: dict) -> str:
+@overload
+def apply_template(template: str, doc: dict[str, Any]) -> str: ...
+@overload
+def apply_template(template: list[str], doc: dict[str, Any]) -> list[str]: ...
+def apply_template(template: Union[str, list[str]], doc: dict) -> Union[str, list[str]]:
+    if isinstance(template, list):
+        return [
+            apply_template(x, doc) if (x.startswith("{{") and x.endswith("}}")) else x
+            for x in template
+        ]
    rtemplate = env.from_string(template)
    return rtemplate.render(**doc)