"torchvision/datapoints/_bounding_box.py" did not exist on "4cb83c2f285101f83b5143663e0d90305e9d7200"
Commit b89af51e authored by Baber

update default values; fixes

parent fadd26e4
......@@ -29,12 +29,11 @@ repos:
- id: mixed-line-ending
args: [--fix=lf]
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.12.2
rev: v0.12.5
hooks:
# Run the linter.
- id: ruff-check
args: [ --fix]
# Run the formatter.
args: [--fix]
- id: ruff-format
- repo: https://github.com/codespell-project/codespell
rev: v2.4.1
......
from rich.traceback import install
from lm_eval._cli.harness import HarnessCLI
from lm_eval.utils import setup_logging
install(show_locals=True)
def cli_evaluate() -> None:
"""Main CLI entry point."""
setup_logging()
......
......@@ -8,6 +8,8 @@ from functools import partial
from lm_eval._cli.subcommand import SubCommand
from lm_eval._cli.utils import (
_int_or_none_list_arg_type,
key_val_to_dict,
merge_dicts,
request_caching_arg_to_dict,
try_parse_json,
)
......@@ -22,17 +24,17 @@ class Run(SubCommand):
"run",
help="Run the evaluation harness on specified tasks",
description="Evaluate language models on various benchmarks and tasks.",
usage="lm-eval run --model <model> --tasks <task1,task2,...> [options]",
usage="lm-eval run --model <model> --tasks <task> <task> --model_args <arg=value> <arg=value> [options]",
epilog=textwrap.dedent("""
examples:
# Basic evaluation with HuggingFace model
$ lm-eval run --model hf --model_args pretrained=gpt2 --tasks hellaswag
$ lm-eval run --model hf --model_args pretrained=gpt2 dtype=float32 --tasks hellaswag
# Evaluate on multiple tasks with few-shot examples
$ lm-eval run --model vllm --model_args pretrained=EleutherAI/gpt-j-6B --tasks arc_easy,arc_challenge --num_fewshot 5
$ lm-eval run --model vllm --model_args pretrained=EleutherAI/gpt-j-6B --tasks arc_easy arc_challenge --num_fewshot 5
# Evaluation with custom generation parameters
$ lm-eval run --model hf --model_args pretrained=gpt2 --tasks lambada --gen_kwargs "temperature=0.8,top_p=0.95"
$ lm-eval run --model hf --model_args pretrained=gpt2 --tasks lambada --gen_kwargs temperature=0.8 top_p=0.95 'stop=["\\n\\n"]'
# Use configuration file
$ lm-eval run --config my_config.yaml --tasks mmlu
......@@ -73,9 +75,10 @@ class Run(SubCommand):
"-t",
default=None,
type=str,
metavar="TASK1,TASK2",
nargs="*",
metavar="TASK1 TASK2",
help=textwrap.dedent("""
Comma-separated list of task names or groupings.
Space- or comma-separated list of task names or groupings.
Use 'lm-eval list tasks' to see all available tasks.
""").strip(),
)
......@@ -83,9 +86,10 @@ class Run(SubCommand):
"--model_args",
"-a",
default=None,
type=try_parse_json,
nargs="*",
type=key_val_to_dict,
metavar="ARGS",
help="Model arguments as 'key=val,key2=val2' or JSON string",
help="Model arguments as 'key=val,key2=val2' or `key=val` `key2=val2`",
)
# Evaluation Settings
......@@ -124,10 +128,14 @@ class Run(SubCommand):
)
eval_group.add_argument(
"--gen_kwargs",
type=try_parse_json,
type=key_val_to_dict,
default=None,
nargs="*",
metavar="KWARGS",
help="Generation arguments as 'key=val,key2=val2' or JSON string",
help=textwrap.dedent(
'Generation arguments as `temperature=0,stop=["stop"]` or `key=val` `key2=val2`. '
"Values should be parsable with ast.literal_eval."
),
)
# Data and Output
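The `key_val_to_dict` type now used for `--model_args` and `--gen_kwargs` is imported from `lm_eval._cli.utils`, but its body is not part of this diff. A minimal sketch of what such a parser could look like, assuming each argv token is a single `key=value` pair whose value is passed through `ast.literal_eval` (the real helper presumably also handles the comma-separated form mentioned in the help text):

```python
import ast
from typing import Any


def key_val_to_dict(token: str) -> dict[str, Any]:
    # Hypothetical sketch, not the actual lm_eval._cli.utils implementation.
    # Split "key=value" on the first '=' and try literal_eval on the value so
    # numbers, booleans, and lists come back typed; otherwise keep the string.
    key, _, raw = token.partition("=")
    try:
        value = ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        value = raw
    return {key.strip(): value}


# With nargs="*", argparse collects one dict per token, e.g.
#   --gen_kwargs temperature=0.8 top_p=0.95 'stop=["\n\n"]'
#   -> [{"temperature": 0.8}, {"top_p": 0.95}, {"stop": ["\n\n"]}]
```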
......@@ -160,9 +168,10 @@ class Run(SubCommand):
"-E",
default=None,
type=try_parse_json,
metavar="JSON_FILE",
metavar='"task1": [1,2,3,4,...]"',
help=textwrap.dedent(
'JSON file with specific sample indices for inputs: {"task_name":[indices],...}. Incompatible with --limit.'
"`...` `...` Sample indices for inputs. Incompatible with --limit."
" Values be parsable with ast.literal_eval."
),
)
......@@ -250,24 +259,24 @@ class Run(SubCommand):
)
logging_group.add_argument(
"--wandb_args",
type=str,
type=key_val_to_dict,
default=argparse.SUPPRESS,
metavar="ARGS",
help="Weights & Biases init arguments (key=val,key2=val2)",
help="Weights & Biases init arguments key=val key2=val2",
)
logging_group.add_argument(
"--wandb_config_args",
type=str,
type=key_val_to_dict,
default=argparse.SUPPRESS,
metavar="ARGS",
help="Weights & Biases config arguments (key=val,key2=val2)",
help="Weights & Biases config arguments key=val key2=val2",
)
logging_group.add_argument(
"--hf_hub_log_args",
type=str,
type=key_val_to_dict,
default=argparse.SUPPRESS,
metavar="ARGS",
help="Hugging Face Hub logging arguments (key=val,key2=val2)",
help="Hugging Face Hub logging arguments key=val key2=val2",
)
# Advanced Options
......@@ -307,15 +316,28 @@ class Run(SubCommand):
"--metadata",
type=json.loads,
default=None,
metavar="JSON",
metavar="`key=val` `key2=val2`",
help=textwrap.dedent(
"""JSON metadata for task configs (merged with model_args), required for some tasks such as RULER"""
"""`key=val` `key2=val` args parsable by ast.literal_eval (merged with model_args),
required for some tasks such as RULER"""
),
)
def _execute(self, args: argparse.Namespace) -> None:
@staticmethod
def _execute(args: argparse.Namespace) -> None:
"""Runs the evaluation harness with the provided arguments."""
os.environ["TOKENIZERS_PARALLELISM"] = "false"
MERGE_ARGS_DICTS = [
"model_args",
"gen_kwargs",
"wandb_args",
"wandb_config_args",
"hf_hub_log_args",
]
for arg_name in MERGE_ARGS_DICTS:
if current_value := getattr(args, arg_name, None):
setattr(args, arg_name, merge_dicts(*current_value))
from lm_eval.config.evaluate_config import EvaluatorConfig
eval_logger = logging.getLogger(__name__)
......
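Because `nargs="*"` yields one dict per `key=val` token, `_execute` now folds them with `merge_dicts` before the namespace reaches `EvaluatorConfig`. That helper also lives in `lm_eval._cli.utils` and is not shown in this diff; a plausible sketch, assuming later tokens win on key collisions:

```python
from typing import Any


def merge_dicts(*dicts: dict[str, Any]) -> dict[str, Any]:
    # Hypothetical sketch of lm_eval._cli.utils.merge_dicts: fold left to
    # right so that a later `key=val` token overrides an earlier one.
    merged: dict[str, Any] = {}
    for d in dicts:
        merged.update(d)
    return merged


# merge_dicts({"temperature": 0.8}, {"top_p": 0.95}, {"temperature": 0.0})
# -> {"temperature": 0.0, "top_p": 0.95}
```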
......@@ -8,7 +8,6 @@ import re
from collections.abc import Callable
from copy import deepcopy
from functools import cached_property
from types import MethodType
from typing import TYPE_CHECKING, Any, Literal, overload
import datasets
......@@ -523,8 +522,8 @@ class Task(abc.ABC):
# self.aggregation = lambda: {
# metric_name: get_metric_aggregation(metric_name)
# }
setattr(self._config, "metric_list", [MetricConfig(name=metric_name)])
setattr(self._config, "process_results", lambda *args: {"bypass": 0})
self._config.metric_list = [MetricConfig(name=metric_name)]
self._config.process_results = lambda *args: {"bypass": 0}
def set_fewshot_seed(self, seed: int | None = None) -> None:
self.fewshot_rnd = random.Random(seed)
......@@ -656,6 +655,18 @@ class ConfigurableTask(Task):
)
self.task_docs = self.eval_docs
# for name, fn in self.config._fn.items():
# if hasattr(self, name):
# setattr(
# self,
# name,
# types.MethodType(
# lambda self, *args, _fn=fn, **kwargs: _fn(*args, **kwargs),
# self,
# ),
# )
self.runtime_checks(self.task_docs[0])
def download(
self, dataset_kwargs: dict[str, Any] | None = None, **kwargs
......@@ -968,6 +979,8 @@ class ConfigurableTask(Task):
# if self.prompt is not None:
# doc_to_text = self.prompt
doc_to_text = doc_to_text or self.config.doc_to_text
if callable(doc_to_text):
return doc_to_text(doc)
if doc_to_text in doc:
return doc[doc_to_text]
elif isinstance(doc_to_text, str):
......@@ -1013,6 +1026,8 @@ class ConfigurableTask(Task):
# if self.prompt is not None:
# doc_to_target = self.prompt
doc_to_target = doc_to_target or self.config.doc_to_target
if callable(doc_to_target):
return doc_to_target(doc)
if doc_to_target in doc:
return doc[doc_to_target]
elif isinstance(doc_to_target, str):
......@@ -1274,6 +1289,8 @@ class ConfigurableTask(Task):
)
def process_results(self, doc: dict, results: list) -> dict[str, Any]:
if callable(self.config.process_results):
return self.config.process_results(doc, results)
result_dict = {}
use_metric = list(m.metric_name for m in self.config._metric_list)
if self.OUTPUT_TYPE == "loglikelihood":
......@@ -1423,6 +1440,7 @@ class ConfigurableTask(Task):
# Test One Doc
self.features: list[str] = list(self.task_docs.features.keys())
self.multiple_target = 0
self.multiple_input = 0
test_text = self.doc_to_text(test_doc)
test_target = self.doc_to_target(test_doc)
......@@ -1430,13 +1448,19 @@ class ConfigurableTask(Task):
test_choice = self.doc_to_choice(test_doc)
if not isinstance(test_choice, list):
eval_logger.error("doc_to_choice must return list")
# else:
# num_choice = len(test_choice)
else:
num_choice = len(test_choice)
if isinstance(test_text, int):
eval_logger.debug(
"doc_to_text returned an int. Assuming multiple inputs."
)
if isinstance(test_text, int):
eval_logger.debug(
"doc_to_text returned an int. Assuming multiple inputs."
)
self.multiple_input = num_choice
else:
test_choice = None
......
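The new `callable(...)` branches in `doc_to_text`, `doc_to_target`, and `process_results` let a Python task config supply functions directly instead of column names or templates. A simplified illustration of the resolution order these hunks add (not the full `ConfigurableTask` logic, which also renders Jinja templates and handles choice indices):

```python
from typing import Any, Callable, Union


def resolve_doc_to_text(doc: dict[str, Any], doc_to_text: Union[Callable, str]) -> Any:
    # Simplified sketch of the lookup order: callables are invoked with the
    # doc, a matching column name indexes into the doc, and any other string
    # would be rendered as a template in the real implementation.
    if callable(doc_to_text):
        return doc_to_text(doc)
    if doc_to_text in doc:
        return doc[doc_to_text]
    if isinstance(doc_to_text, str):
        return doc_to_text  # placeholder for template rendering
    raise TypeError(f"Unsupported doc_to_text: {doc_to_text!r}")


# e.g. with a callable supplied by a task config:
# resolve_doc_to_text({"question": "2+2?"}, lambda d: f"Q: {d['question']}")
# -> "Q: 2+2?"
```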
......@@ -21,6 +21,7 @@ DICT_KEYS = [
"hf_hub_log_args",
"metadata",
"model_args",
"gen_kwargs",
]
......@@ -79,7 +80,7 @@ class EvaluatorConfig:
# Device
device: Optional[str] = field(
default=None, metadata={"help": "Device to use (e.g. cuda, cuda:0, cpu)"}
default="cuda:0", metadata={"help": "Device to use (e.g. cuda, cuda:0, cpu)"}
)
# Data sampling and limiting
......@@ -126,7 +127,10 @@ class EvaluatorConfig:
default=None, metadata={"help": "Custom System instruction to add"}
)
apply_chat_template: Union[bool, str] = field(
default=False, metadata={"help": "Apply chat template to prompt"}
default=False,
metadata={
"help": "Apply chat template to prompt. Either True, or a string identifying the tokenizer template."
},
)
fewshot_as_multiturn: bool = field(
default=False,
......@@ -170,7 +174,7 @@ class EvaluatorConfig:
metadata={"help": "Seeds for random, numpy, torch, fewshot (random)"},
)
# Security and safety
# Security
trust_remote_code: bool = field(
default=False, metadata={"help": "Trust remote code for HF datasets"}
)
......@@ -201,7 +205,7 @@ class EvaluatorConfig:
config.update(cls.load_yaml_config(namespace.config))
# Override with CLI args (only truthy values, exclude non-config args)
excluded_args = {"config", "command", "func"} # argparse internal args
excluded_args = {"command", "func"} # argparse internal args
cli_args = {
k: v for k, v in vars(namespace).items() if v and k not in excluded_args
}
......@@ -252,7 +256,6 @@ class EvaluatorConfig:
try:
yaml_data = yaml.safe_load(config_file.read_text())
print(textwrap.dedent(f"""yaml: {yaml_data}"""))
except yaml.YAMLError as e:
raise ValueError(f"Invalid YAML in {config_path}: {e}")
except (OSError, UnicodeDecodeError) as e:
......@@ -337,17 +340,10 @@ class EvaluatorConfig:
metadata=self.metadata if self.metadata else {},
)
# self.tasks is a comma-separated string of task names
if isinstance((task_list := self.tasks), str):
task_list = self.tasks.split(",")
else:
assert isinstance(self.tasks, list), (
"`tasks` must be a comma delimited string of task names or list[str]."
)
task_names = task_manager.match_tasks(task_list)
task_names = task_manager.match_tasks(self.tasks)
# Check for any individual task files in the list
for task in [task for task in task_list if task not in task_names]:
for task in [task for task in self.tasks if task not in task_names]:
task_path = Path(task)
if task_path.is_file():
config = utils.load_yaml_config(str(task_path))
......@@ -355,7 +351,7 @@ class EvaluatorConfig:
# Check for missing tasks
task_missing = [
task for task in task_list if task not in task_names and "*" not in task
task for task in self.tasks if task not in task_names and "*" not in task
]
if task_missing:
......
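For context on the `from_namespace` hunk above: YAML values are loaded first and any truthy CLI value then overrides them, with `command` and `func` excluded as argparse bookkeeping. A hedged sketch of that precedence, simplified from the real method:

```python
import argparse
from typing import Any


def merged_config(namespace: argparse.Namespace, yaml_config: dict[str, Any]) -> dict[str, Any]:
    # Sketch of the precedence in EvaluatorConfig.from_namespace (simplified):
    # start from the YAML file, then let truthy CLI values win.
    excluded = {"command", "func"}  # argparse internals, not config fields
    config = dict(yaml_config)
    config.update(
        {k: v for k, v in vars(namespace).items() if v and k not in excluded}
    )
    return config
```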
......@@ -38,7 +38,7 @@ class MetricConfig:
return is_higher_better(self.name)
return self.higher_is_better
def compute_metric(self, *args, **kwargs) -> Any:
def compute(self, *args, **kwargs) -> Any:
"""Calculates the metric using the provided function and arguments."""
if self.fn is None:
raise ValueError(f"Metric function for {self.name} is not defined.")
......
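Call sites simply switch from `compute_metric(...)` to `compute(...)`. A hedged usage sketch; the `fn` field exists per the check above, but the constructor and call signatures are assumed for illustration:

```python
from lm_eval.config.metric import MetricConfig

# Hypothetical usage of the renamed method (argument shapes are assumptions).
exact_match = MetricConfig(name="exact_match", fn=lambda pred, gold: float(pred == gold))
score = exact_match.compute("Paris", "Paris")  # was: exact_match.compute_metric(...)
```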
......@@ -10,7 +10,7 @@ import datasets
from lm_eval.api.filter import FilterEnsemble
from lm_eval.api.instance import OutputType
from lm_eval.config.metric import MetricConfig
from lm_eval.config.utils import doc_to_closure, maybe_serialize
from lm_eval.config.utils import maybe_serialize
if TYPE_CHECKING:
......@@ -364,7 +364,7 @@ class TaskConfig:
@classmethod
def from_yaml(cls, data: dict[str, Any]) -> TaskConfig:
"""Create a TaskConfig instance from a YAML-like dictionary."""
fn = {k: doc_to_closure(v) for k, v in data.items() if callable(v)}
fn = {k: v for k, v in data.items() if callable(v)}
return cls(**data, _fn=fn)
@classmethod
......
......@@ -475,7 +475,9 @@ def evaluate(
"Either 'limit' or 'samples' must be None, but both are not None."
)
if samples is not None:
eval_logger.info(f"Evaluating examples for tasks {list(samples.keys())}")
eval_logger.info(
f"Evaluating examples for tasks {[x for x in list(samples.keys()) if x in task_dict.keys()]}"
)
if apply_chat_template:
eval_logger.warning(
"Chat template formatting change affects loglikelihood and multiple-choice tasks. See docs/chat-template-readme.md for details."
......
......@@ -11,10 +11,10 @@ authors = [
description = "A framework for evaluating language models"
readme = "README.md"
classifiers = [
"Development Status :: 3 - Alpha",
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent"
"Development Status :: 3 - Alpha",
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
]
requires-python = ">=3.9"
license = { "text" = "MIT" }
......
......@@ -4,11 +4,12 @@
# instead of passing them as command-line arguments.
#
# Usage:
# $ lm_eval --config configs/default_config.yaml
# $ lm_eval --config templates/example_ci_config.yaml
#
# You can override any values in this config with command-line arguments:
# $ lm_eval --config configs/default_config.yaml --model_args pretrained=gpt2 --tasks mmlu
# You can override any values in this config with further command-line arguments:
# $ lm_eval --config templates/example_ci_config.yaml --model_args pretrained=gpt2 --tasks mmlu
#
# For expected types and values, refer to EvaluatorConfig in lm_eval/config/evaluate_config.py
# All parameters are optional and have the same meaning as their CLI counterparts.
model: hf
......@@ -17,9 +18,18 @@ model_args:
dtype: float16
tasks:
- hellaswag
- gsm8k
- arc_easy
batch_size: 1
trust_remote_code: true
log_samples: true
output_path: ./test
limit: 10
gen_kwargs:
do_sample: true
temperature: 0.7
stop: ["\n", "<|endoftext|>"]
samples:
hellaswag: [1,2,3,4,5,6,7,8,9,10]
arc_easy: [10,20,30,40,50,60,70,80,90,100]
metadata:
name: Example CI Config
description: This is an example configuration file for testing purposes.