help="Path to config with all arguments for `lm-eval`",
help="Set initial arguments from YAML config",
)
)
# Model and Tasks
model_group = self._parser.add_argument_group("model and tasks")
model_group.add_argument(
    "--model",
    "-m",
    type=str,
    default="hf",
    metavar="MODEL_NAME",
    help="Model name (default: hf)",
)
model_group.add_argument(
    "--tasks",
    "-t",
    default=None,
    type=str,
    metavar="TASK1,TASK2",
    help=textwrap.dedent("""
        Comma-separated list of task names or groupings.
        Use 'lm-eval list tasks' to see all available tasks.
        """).strip(),
)
model_group.add_argument(
    "--model_args",
    "-a",
    default=None,
    type=try_parse_json,  # accepts 'k=v,k2=v2' or a JSON object string
    metavar="ARGS",
    help="Model arguments as 'key=val,key2=val2' or JSON string",
)

# Evaluation Settings
eval_group = self._parser.add_argument_group("evaluation settings")
# NOTE(review): the flag-name/type/default lines of this option were lost in
# the merge; "--batch_size"/"-b" with type=str and default 1 matches the
# surviving help text ("auto", "auto:N", or integer; default 1) — confirm
# against version control.
eval_group.add_argument(
    "--batch_size",
    "-b",
    type=str,
    default=1,
    metavar="auto|auto:N|N",
    help=textwrap.dedent(
        "Batch size: 'auto', 'auto:N' (auto-tune N times), or integer (default: 1)"
    ),
)
eval_group.add_argument(
    "--max_batch_size",
    type=int,
    default=None,
    metavar="N",
    help="Maximum batch size when using --batch_size auto",
)
eval_group.add_argument(
    "--device",
    type=str,
    default=None,
    metavar="DEVICE",
    help="Device to use (e.g. cuda, cuda:0, cpu, mps)",
)
eval_group.add_argument(
    "--gen_kwargs",
    type=try_parse_json,  # accepts 'k=v,k2=v2' or a JSON object string
    default=None,
    metavar="KWARGS",
    help="Generation arguments as 'key=val,key2=val2' or JSON string",
)
# Data and Output
data_group = self._parser.add_argument_group("data and output")
data_group.add_argument(
    "--output_path",
    "-o",
    default=None,
    type=str,
    metavar="OUTPUT_PATH",
    help="Output dir or json file for results (and samples)",
)
data_group.add_argument(
    "--log_samples",
    "-s",
    action="store_true",
    # SUPPRESS keeps the attribute off the namespace unless the flag is given,
    # so a YAML config value is not clobbered by an argparse default.
    default=argparse.SUPPRESS,
    help="Save all model outputs and documents for post-hoc analysis",
)
data_group.add_argument(
    "--limit",
    "-L",
    type=float,  # float so a fraction (0 < N < 1) can select a percentage
    default=None,
    metavar="N|0.0-1.0",
    help="Limit examples per task (integer count or fraction)",
)
data_group.add_argument(
    "--samples",
    "-E",
    default=None,
    type=try_parse_json,  # JSON string or path to a JSON file
    metavar="JSON_FILE",
    help=textwrap.dedent(
        'JSON file with specific sample indices for inputs: {"task_name":[indices],...}. Incompatible with --limit.'
    ),
)
# Caching and Performance
cache_group = self._parser.add_argument_group("caching and performance")
cache_group.add_argument(
    "--use_cache",
    "-c",
    type=str,
    default=None,  # None disables response caching
    metavar="CACHE_DIR",
    help="SQLite database path for caching model outputs.",
)
cache_group.add_argument(
    "--cache_requests",
    type=request_caching_arg_to_dict,
    default=None,
    choices=["true", "refresh", "delete"],
    help="Cache dataset request building (true|refresh|delete)",
)
cache_group.add_argument(
    "--check_integrity",
    action="store_true",
    # SUPPRESS: attribute absent unless the flag is passed (config-friendly)
    default=argparse.SUPPRESS,
    help="Run task test suite validation",
)
# --write_out and --log_samples are registered with the logging and data
# argument groups at their other occurrences in this method; only
# --include_path belongs at this position.
self._parser.add_argument(
    "--include_path",
    type=str,
    default=None,
    metavar="TASK_DIR",
    help="Additional directory for external tasks",
)
# Logging and Tracking
# (--gen_kwargs is registered with the evaluation-settings group earlier in
# this method; the stray duplicate that was merged into this span is dropped.)
logging_group = self._parser.add_argument_group("logging and tracking")
logging_group.add_argument(
    "--verbosity",
    "-v",
    type=str.upper,  # normalize e.g. 'debug' -> 'DEBUG'
    default=None,
    metavar="LEVEL",
    help="(Deprecated) Log level. Use LOGLEVEL env var instead",
)
logging_group.add_argument(
    "--write_out",
    "-w",
    action="store_true",
    # SUPPRESS: attribute absent unless the flag is passed (config-friendly)
    default=argparse.SUPPRESS,
    help="Print prompts for first few documents",
)
logging_group.add_argument(
    "--show_config",
    action="store_true",
    default=argparse.SUPPRESS,
    help="Display full task configuration after evaluation",
)
logging_group.add_argument(
    "--wandb_args",
    type=str,
    default=argparse.SUPPRESS,
    help="Comma separated string arguments passed to wandb.init, e.g. `project=lm-eval,job_type=eval`",
)
# NOTE(review): the merge collapsed two wandb options into one flag; the
# second surviving help string describes wandb.config.update, which is a
# separate --wandb_config_args option upstream — confirm against version
# control.
logging_group.add_argument(
    "--wandb_config_args",
    type=str,
    default=argparse.SUPPRESS,
    help="Comma separated string arguments passed to wandb.config.update. Use this to trace parameters that aren't already traced by default. eg. `lr=0.01,repeats=3`",
)

# Advanced
advanced_group = self._parser.add_argument_group("advanced")
# NOTE(review): the flag-name line and any 'type=' converter for the seed
# option were lost in the merge — restore the converter from version control.
advanced_group.add_argument(
    "--seed",
    default=default_seed_string,  # for backward compatibility
    metavar="SEED|S1,S2,S3,S4",
    help=textwrap.dedent(f"""
        Random seeds for python,numpy,torch,fewshot (default: {default_seed_string}).
        Use single integer for all, or comma-separated list of 4 values.
        Use 'None' to skip setting a seed. Example: --seed 42 or --seed 0,None,8,52
        """).strip(),
)
advanced_group.add_argument(
    "--trust_remote_code",
    action="store_true",
    # SUPPRESS: attribute absent unless the flag is passed (config-friendly)
    default=argparse.SUPPRESS,
    help="Allow executing remote code from Hugging Face Hub",
)
advanced_group.add_argument(
    "--confirm_run_unsafe_code",
    action="store_true",
    default=argparse.SUPPRESS,
    help="Confirm understanding of unsafe code execution risks",
)
self._parser.add_argument(
advanced_group.add_argument(
"--metadata",
"--metadata",
type=json.loads,
type=json.loads,
default=None,
default=None,
help="""JSON string metadata to pass to task configs, for example '{"max_seq_lengths":[4096,8192]}'. Will be merged with model_args. Can also be set in task config.""",
metavar="JSON",
help=textwrap.dedent(
"JSON metadata for task configs (merged with model_args), required for some tasks such as RULER"