gaoqiong / lm-evaluation-harness · Commits

Commit 91e49e23
Authored Jul 10, 2025 by Baber

fix: update default values and improve help text in configuration files

Parent: 84d02f77
Showing 4 changed files with 27 additions and 25 deletions
lm_eval/_cli/run.py                  +10  -7
lm_eval/config/evaluate_config.py    +11  -15
lm_eval/evaluator.py                  +3  -1
templates/example_ci_config.yaml      +3  -2
lm_eval/_cli/run.py  (view file @ 91e49e23)

@@ -24,7 +24,7 @@ class Run(SubCommand):
             "run",
             help="Run the evaluation harness on specified tasks",
             description="Evaluate language models on various benchmarks and tasks.",
-            usage="lm-eval run --model <model> --tasks <task1,task2,...> [options]",
+            usage="lm-eval run --model <model> --tasks <task> <task> --model_args <arg=value> <arg=value> [options]",
             epilog=textwrap.dedent(
                 """
                 examples:
                   # Basic evaluation with HuggingFace model

@@ -34,7 +34,7 @@ class Run(SubCommand):
                   $ lm-eval run --model vllm --model_args pretrained=EleutherAI/gpt-j-6B --tasks arc_easy arc_challenge --num_fewshot 5
                   # Evaluation with custom generation parameters
-                  $ lm-eval run --model hf --model_args pretrained=gpt2 --tasks lambada --gen_kwargs temperature=0.8 top_p=0.95
+                  $ lm-eval run --model hf --model_args pretrained=gpt2 --tasks lambada --gen_kwargs temperature=0.8 top_p=0.95 'stop=["\\n\\n"]'
                   # Use configuration file
                   $ lm-eval run --config my_config.yaml --tasks mmlu

@@ -133,7 +133,8 @@ class Run(SubCommand):
             nargs="*",
             metavar="KWARGS",
             help=textwrap.dedent(
-                'Generation arguments as `temperature=0,stop=["stop"]` or `key=val` `key2=val2`. Values should be parsable with ast.literal_eval.'
+                'Generation arguments as `temperature=0,stop=["stop"]` or `key=val` `key2=val2`.'
+                "Values should be parsable with ast.literal_eval."
             ),
         )

@@ -167,9 +168,10 @@ class Run(SubCommand):
             "-E",
             default=None,
             type=try_parse_json,
-            metavar="JSON_FILE",
+            metavar='"task1": [1,2,3,4,...]"',
             help=textwrap.dedent(
-                'JSON file with specific sample indices for inputs: {"task_name":[indices],...}. Incompatible with --limit.'
+                "`...` `...` Sample indices for inputs. Incompatible with --limit."
+                " Values be parsable with ast.literal_eval."
             ),
         )

@@ -314,9 +316,10 @@ class Run(SubCommand):
             "--metadata",
             type=json.loads,
             default=None,
-            metavar="JSON",
+            metavar="`key=val` `key2=val2`",
             help=textwrap.dedent(
-                """JSON metadata for task configs (merged with model_args), required for some tasks such as RULER"""
+                """`key=val` `key2=val` args parsable by ast.literal_eval (merged with model_args),
+                required for some tasks such as RULER"""
             ),
         )
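Several of the updated help strings (--gen_kwargs, --samples, --metadata) point to ast.literal_eval for turning `key=val` pairs into Python values. A minimal sketch of that kind of parsing; the helper parse_kv_args is hypothetical and not part of the harness:

    import ast

    def parse_kv_args(pairs):
        # Hypothetical helper: parse ["temperature=0.8", 'stop=["\n\n"]'] into a dict.
        parsed = {}
        for pair in pairs:
            key, _, raw = pair.partition("=")
            try:
                parsed[key] = ast.literal_eval(raw)  # numbers, strings, lists, ...
            except (ValueError, SyntaxError):
                parsed[key] = raw  # fall back to the raw string
        return parsed

    print(parse_kv_args(["temperature=0.8", "top_p=0.95", 'stop=["\\n\\n"]']))
    # {'temperature': 0.8, 'top_p': 0.95, 'stop': ['\n\n']}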
lm_eval/config/evaluate_config.py  (view file @ 91e49e23)

@@ -21,6 +21,7 @@ DICT_KEYS = [
     "hf_hub_log_args",
     "metadata",
     "model_args",
+    "gen_kwargs",
 ]
@@ -79,7 +80,7 @@ class EvaluatorConfig:
     # Device
     device: Optional[str] = field(
-        default=None, metadata={"help": "Device to use (e.g. cuda, cuda:0, cpu)"}
+        default="cuda:0", metadata={"help": "Device to use (e.g. cuda, cuda:0, cpu)"}
     )
     # Data sampling and limiting

@@ -126,7 +127,10 @@ class EvaluatorConfig:
         default=None, metadata={"help": "Custom System instruction to add"}
     )
     apply_chat_template: Union[bool, str] = field(
-        default=False, metadata={"help": "Apply chat template to prompt"}
+        default=False,
+        metadata={
+            "help": "Apply chat template to prompt. Either True, or a string identifying the tokenizer template."
+        },
     )
     fewshot_as_multiturn: bool = field(
         default=False,
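The apply_chat_template field keeps the field(default=..., metadata={"help": ...}) pattern used throughout EvaluatorConfig. A standalone sketch of that pattern, with a hypothetical DemoConfig class rather than the harness's real one, showing how the help metadata can be read back:

    from dataclasses import dataclass, field, fields
    from typing import Optional, Union

    # Standalone sketch; DemoConfig is hypothetical, not the harness's EvaluatorConfig.
    @dataclass
    class DemoConfig:
        device: Optional[str] = field(
            default="cuda:0", metadata={"help": "Device to use (e.g. cuda, cuda:0, cpu)"}
        )
        apply_chat_template: Union[bool, str] = field(
            default=False,
            metadata={
                "help": "Apply chat template to prompt. Either True, or a string identifying the tokenizer template."
            },
        )

    # The help metadata can be read back, e.g. to build CLI help text.
    for f in fields(DemoConfig):
        print(f"{f.name}: {f.metadata['help']} (default={f.default!r})")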
@@ -170,7 +174,7 @@ class EvaluatorConfig:
         metadata={"help": "Seeds for random, numpy, torch, fewshot (random)"},
     )
-    # Security and safety
+    # Security
     trust_remote_code: bool = field(
         default=False, metadata={"help": "Trust remote code for HF datasets"}
     )

@@ -201,7 +205,7 @@ class EvaluatorConfig:
             config.update(cls.load_yaml_config(namespace.config))
         # Override with CLI args (only truthy values, exclude non-config args)
-        excluded_args = {"config", "command", "func"}  # argparse internal args
+        excluded_args = {"command", "func"}  # argparse internal args
         cli_args = {
             k: v for k, v in vars(namespace).items() if v and k not in excluded_args
         }
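The override logic loads the YAML config first, then merges in only truthy CLI values while skipping argparse-internal keys. A minimal sketch of that merge with a made-up Namespace; the attribute names here are illustrative, not the harness's full argument set:

    from argparse import Namespace

    # Values loaded from a YAML config file (hypothetical contents).
    config = {"model": "hf", "batch_size": 1, "device": "cpu"}

    # Parsed CLI arguments; None/False/empty values mean "not set on the CLI".
    namespace = Namespace(command="run", func=None, model=None, device="cuda:0", tasks=["mmlu"])

    excluded_args = {"command", "func"}  # argparse internal args
    cli_args = {k: v for k, v in vars(namespace).items() if v and k not in excluded_args}

    config.update(cli_args)  # truthy CLI values win over YAML values
    print(config)  # {'model': 'hf', 'batch_size': 1, 'device': 'cuda:0', 'tasks': ['mmlu']}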
@@ -252,7 +256,6 @@ class EvaluatorConfig:
         try:
             yaml_data = yaml.safe_load(config_file.read_text())
-            print(textwrap.dedent(f"""yaml: {yaml_data} """))
         except yaml.YAMLError as e:
             raise ValueError(f"Invalid YAML in {config_path}: {e}")
         except (OSError, UnicodeDecodeError) as e:
@@ -337,17 +340,10 @@ class EvaluatorConfig:
             metadata=self.metadata if self.metadata else {},
         )
-        # self.tasks is a comma-separated string of task names
-        if isinstance((task_list := self.tasks), str):
-            task_list = self.tasks.split(",")
-        else:
-            assert isinstance(self.tasks, list), (
-                "`tasks` must be a comma delimited string of task names or list[str]."
-            )
-        task_names = task_manager.match_tasks(task_list)
+        task_names = task_manager.match_tasks(self.tasks)
         # Check for any individual task files in the list
-        for task in [task for task in task_list if task not in task_names]:
+        for task in [task for task in self.tasks if task not in task_names]:
             task_path = Path(task)
             if task_path.is_file():
                 config = utils.load_yaml_config(str(task_path))

@@ -355,7 +351,7 @@ class EvaluatorConfig:
         # Check for missing tasks
         task_missing = [
-            task for task in task_list if task not in task_names and "*" not in task
+            task for task in self.tasks if task not in task_names and "*" not in task
         ]
         if task_missing:
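With this change self.tasks is treated as a list of task names throughout, instead of being split from a comma-separated string. A small sketch of the match-then-report-missing pattern the diff uses; match_tasks here is a hypothetical stand-in for task_manager.match_tasks, implemented with fnmatch:

    import fnmatch

    # Hypothetical stand-in for task_manager.match_tasks: resolve requested names
    # (including "*" wildcards) against the set of registered task names.
    def match_tasks(requested, registered):
        matched = []
        for name in requested:
            matched.extend(t for t in registered if fnmatch.fnmatch(t, name))
        return sorted(set(matched))

    registered = ["hellaswag", "arc_easy", "arc_challenge", "mmlu"]
    tasks = ["hellaswag", "arc_*", "not_a_task"]

    task_names = match_tasks(tasks, registered)
    # Anything that neither matched nor contains a wildcard is reported as missing.
    task_missing = [t for t in tasks if t not in task_names and "*" not in t]
    print(task_names)    # ['arc_challenge', 'arc_easy', 'hellaswag']
    print(task_missing)  # ['not_a_task']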
lm_eval/evaluator.py  (view file @ 91e49e23)

@@ -467,7 +467,9 @@ def evaluate(
             "Either 'limit' or 'samples' must be None, but both are not None."
         )
     if samples is not None:
-        eval_logger.info(f"Evaluating examples for tasks {list(samples.keys())}")
+        eval_logger.info(
+            f"Evaluating examples for tasks {[x for x in list(samples.keys()) if x in task_dict.keys()]}"
+        )
     if apply_chat_template:
         eval_logger.warning(
             "Chat template formatting change affects loglikelihood and multiple-choice tasks. See docs/chat-template-readme.md for details."
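The updated log line reports only the sample keys that correspond to tasks actually present in task_dict. A small illustration of that filter with made-up dictionaries:

    # Hypothetical inputs: per-task sample indices and the tasks actually being run.
    samples = {"hellaswag": [1, 2, 3], "arc_easy": [10, 20], "unused_task": [0]}
    task_dict = {"hellaswag": object(), "arc_easy": object()}

    # Only report tasks that are present in task_dict, mirroring the new log line.
    reported = [x for x in list(samples.keys()) if x in task_dict.keys()]
    print(f"Evaluating examples for tasks {reported}")
    # -> Evaluating examples for tasks ['hellaswag', 'arc_easy']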
templates/example_ci_config.yaml  (view file @ 91e49e23)

@@ -6,9 +6,10 @@
 # Usage:
 #   $ lm_eval --config templates/example_ci_config.yaml
 #
-# You can override any values in this config with command-line arguments:
+# You can override any values in this config with further command-line arguments:
 #   $ lm_eval --config templates/example_ci_config.yaml --model_args pretrained=gpt2 --tasks mmlu
 #
 # For expected types and values, refer to EvaluatorConfig in lm_eval/config/evaluate_config.py
+# All parameters are optional and have the same meaning as their CLI counterparts.
 model: hf

@@ -19,13 +20,13 @@ tasks:
   - hellaswag
   - arc_easy
 batch_size: 1
 device: mps
 trust_remote_code: true
 log_samples: true
 output_path: ./test
 gen_kwargs:
   do_sample: true
   temperature: 0.7
   stop: ["\n", "<|endoftext|>"]
 samples:
   hellaswag: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
   arc_easy: [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
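The samples block pairs each task with explicit example indices, matching the --samples/-E option changed above. A small illustration of how such a mapping could be used to select rows; the datasets dictionary here is made up for the example:

    # Hypothetical per-task sample indices, as in the YAML above.
    samples = {
        "hellaswag": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        "arc_easy": [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
    }

    # A made-up "dataset" per task: here just a list of document ids.
    datasets = {name: [f"{name}-doc-{i}" for i in range(200)] for name in samples}

    # Select only the listed indices for each task instead of evaluating everything.
    selected = {name: [datasets[name][i] for i in idxs] for name, idxs in samples.items()}
    print(selected["arc_easy"][:3])  # ['arc_easy-doc-10', 'arc_easy-doc-20', 'arc_easy-doc-30']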