Commit 9de93651 authored by Baber's avatar Baber
Browse files

cleanup

parent febdcc5b
...@@ -3,7 +3,6 @@ CLI subcommands for the Language Model Evaluation Harness. ...@@ -3,7 +3,6 @@ CLI subcommands for the Language Model Evaluation Harness.
""" """
from lm_eval._cli.base import SubCommand from lm_eval._cli.base import SubCommand
from lm_eval._cli.cache import Cache
from lm_eval._cli.cli import CLIParser from lm_eval._cli.cli import CLIParser
from lm_eval._cli.list import ListCommand from lm_eval._cli.list import ListCommand
from lm_eval._cli.run import Run from lm_eval._cli.run import Run
...@@ -15,6 +14,5 @@ __all__ = [ ...@@ -15,6 +14,5 @@ __all__ = [
"Run", "Run",
"ListCommand", "ListCommand",
"ValidateCommand", "ValidateCommand",
"Cache",
"CLIParser", "CLIParser",
] ]
...@@ -14,7 +14,7 @@ class SubCommand(ABC): ...@@ -14,7 +14,7 @@ class SubCommand(ABC):
return cls(subparsers) return cls(subparsers)
@abstractmethod @abstractmethod
def _add_args(self, parser: argparse.ArgumentParser) -> None: def _add_args(self) -> None:
"""Add arguments specific to this subcommand.""" """Add arguments specific to this subcommand."""
pass pass
......
import argparse
from lm_eval._cli.base import SubCommand
class Cache(SubCommand):
    """Command for cache management."""

    def __init__(self, subparsers: argparse._SubParsersAction, *args, **kwargs):
        """Register the ``cache`` subcommand on the given subparsers action.

        Builds the argparse parser for cache management, attaches the
        command-specific arguments, and wires ``execute`` as the handler.
        """
        super().__init__(*args, **kwargs)
        # Build and register the 'cache' subcommand parser.
        cache_parser = subparsers.add_parser(
            "cache",
            help="Manage evaluation cache",
            description="Manage evaluation cache files and directories.",
            epilog="""
Examples:
  lm-eval cache clear --cache_path ./cache.db # Clear cache file
  lm-eval cache info --cache_path ./cache.db # Show cache info
  lm-eval cache clear --cache_path ./cache_dir/ # Clear cache directory
""",
            # Raw formatter keeps the epilog's example layout intact.
            formatter_class=argparse.RawDescriptionHelpFormatter,
        )
        # Attach cache-specific arguments, then bind the handler so that
        # 'lm-eval cache ...' dispatches to self.execute.
        self._add_args(cache_parser)
        cache_parser.set_defaults(func=self.execute)

    def _add_args(self, parser: argparse.ArgumentParser) -> None:
        """Add the positional action and the cache-path option to *parser*."""
        parser.add_argument(
            "action",
            choices=["clear", "info"],
            help="Action to perform: clear or info",
        )
        parser.add_argument(
            "--cache_path",
            type=str,
            default=None,
            help="Path to cache directory or file",
        )

    def execute(self, args: argparse.Namespace) -> None:
        """Execute the cache command."""
        # Intentionally unimplemented stub: cache handling is not wired up yet.
        raise NotImplementedError
import argparse import argparse
import sys import sys
import textwrap
from lm_eval._cli.cache import Cache
from lm_eval._cli.run import Run
from lm_eval._cli.list import ListCommand from lm_eval._cli.list import ListCommand
from lm_eval._cli.run import Run
from lm_eval._cli.validate import ValidateCommand from lm_eval._cli.validate import ValidateCommand
...@@ -14,7 +14,31 @@ class CLIParser: ...@@ -14,7 +14,31 @@ class CLIParser:
self._parser = argparse.ArgumentParser( self._parser = argparse.ArgumentParser(
prog="lm-eval", prog="lm-eval",
description="Language Model Evaluation Harness", description="Language Model Evaluation Harness",
formatter_class=argparse.RawTextHelpFormatter, epilog=textwrap.dedent("""
quick start:
# Basic evaluation
lm-eval run --model hf --model_args pretrained=gpt2 --tasks hellaswag
# List available tasks
lm-eval list tasks
# Validate task configurations
lm-eval validate --tasks hellaswag,arc_easy
available commands:
run Run the harness on specified tasks
list List available tasks, groups, subtasks, or tags
validate Validate task configurations and check for errors
legacy compatibility:
The harness maintains backward compatibility with the original interface.
If no command is specified, 'run' is automatically inserted:
lm-eval --model hf --tasks hellaswag # Equivalent to 'lm-eval run --model hf --tasks hellaswag'
For documentation, visit: https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md
"""),
formatter_class=argparse.RawDescriptionHelpFormatter,
) )
self._parser.set_defaults(func=lambda args: self._parser.print_help()) self._parser.set_defaults(func=lambda args: self._parser.print_help())
self._subparsers = self._parser.add_subparsers( self._subparsers = self._parser.add_subparsers(
...@@ -23,7 +47,6 @@ class CLIParser: ...@@ -23,7 +47,6 @@ class CLIParser:
Run.create(self._subparsers) Run.create(self._subparsers)
ListCommand.create(self._subparsers) ListCommand.create(self._subparsers)
ValidateCommand.create(self._subparsers) ValidateCommand.create(self._subparsers)
Cache.create(self._subparsers)
def parse_args(self) -> argparse.Namespace: def parse_args(self) -> argparse.Namespace:
"""Parse arguments using the main parser.""" """Parse arguments using the main parser."""
......
import argparse import argparse
import textwrap
from lm_eval._cli.base import SubCommand from lm_eval._cli.base import SubCommand
...@@ -9,30 +10,51 @@ class ListCommand(SubCommand): ...@@ -9,30 +10,51 @@ class ListCommand(SubCommand):
def __init__(self, subparsers: argparse._SubParsersAction, *args, **kwargs): def __init__(self, subparsers: argparse._SubParsersAction, *args, **kwargs):
# Create and configure the parser # Create and configure the parser
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
parser = subparsers.add_parser( self._parser = subparsers.add_parser(
"list", "list",
help="List available tasks, groups, subtasks, or tags", help="List available tasks, groups, subtasks, or tags",
description="List available tasks, groups, subtasks, or tags from the evaluation harness.", description="List available tasks, groups, subtasks, or tags from the evaluation harness.",
epilog=""" epilog=textwrap.dedent("""
Examples: examples:
lm-eval list tasks # List all available tasks # List all available tasks (includes groups, subtasks, and tags)
lm-eval list groups # List task groups only $ lm-eval list tasks
lm-eval list subtasks # List subtasks only
lm-eval list tags # List available tags # List only task groups (like 'mmlu', 'glue', 'superglue')
lm-eval list tasks --include_path /path/to/external/tasks $ lm-eval list groups
""",
# List only individual subtasks (like 'mmlu_abstract_algebra')
$ lm-eval list subtasks
# Include external task definitions
$ lm-eval list tasks --include_path /path/to/external/tasks
# List tasks from multiple external paths
$ lm-eval list tasks --include_path "/path/to/tasks1:/path/to/tasks2"
organization:
• Groups: Collections of tasks with aggregated metric across subtasks (e.g., 'mmlu')
• Subtasks: Individual evaluation tasks (e.g., 'mmlu_anatomy', 'hellaswag')
• Tags: Similar to groups but no aggregate metric (e.g., 'reasoning', 'knowledge', 'language')
• External Tasks: Custom tasks defined in external directories
evaluation usage:
After listing tasks, use them with the run command!
For more information tasks configs are defined in https://github.com/EleutherAI/lm-evaluation-harness/tree/main/lm_eval/tasks
"""),
formatter_class=argparse.RawDescriptionHelpFormatter, formatter_class=argparse.RawDescriptionHelpFormatter,
) )
self._add_args(parser) self._add_args()
parser.set_defaults(func=self.execute) self._parser.set_defaults(func=lambda arg: self._parser.print_help())
def _add_args(self, parser: argparse.ArgumentParser) -> None: def _add_args(self) -> None:
parser.add_argument( self._parser.add_argument(
"what", "what",
choices=["tasks", "groups", "subtasks", "tags"], choices=["tasks", "groups", "subtasks", "tags"],
nargs="?",
help="What to list: tasks (all), groups, subtasks, or tags", help="What to list: tasks (all), groups, subtasks, or tags",
) )
parser.add_argument( self._parser.add_argument(
"--include_path", "--include_path",
type=str, type=str,
default=None, default=None,
...@@ -54,3 +76,5 @@ Examples: ...@@ -54,3 +76,5 @@ Examples:
print(task_manager.list_all_tasks(list_groups=False, list_tags=False)) print(task_manager.list_all_tasks(list_groups=False, list_tags=False))
elif args.what == "tags": elif args.what == "tags":
print(task_manager.list_all_tasks(list_groups=False, list_subtasks=False)) print(task_manager.list_all_tasks(list_groups=False, list_subtasks=False))
elif args.what is None:
self._parser.print_help()
...@@ -2,6 +2,7 @@ import argparse ...@@ -2,6 +2,7 @@ import argparse
import json import json
import logging import logging
import os import os
import textwrap
from functools import partial from functools import partial
from lm_eval._cli import SubCommand from lm_eval._cli import SubCommand
...@@ -18,27 +19,34 @@ class Run(SubCommand): ...@@ -18,27 +19,34 @@ class Run(SubCommand):
def __init__(self, subparsers: argparse._SubParsersAction, *args, **kwargs): def __init__(self, subparsers: argparse._SubParsersAction, *args, **kwargs):
# Create and configure the parser # Create and configure the parser
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
parser = subparsers.add_parser( self._parser = subparsers.add_parser(
"run", "run",
help="Run language model evaluation", help="Run language model evaluation",
description="Evaluate language models on various benchmarks and tasks.", description="Evaluate language models on various benchmarks and tasks.",
epilog=""" epilog=textwrap.dedent("""
Examples: examples:
lm-eval run --model hf --model_args pretrained=gpt2 --tasks hellaswag # Basic evaluation with HuggingFace model
lm-eval run --config my_config.yaml --tasks arc_easy,arc_challenge $ lm-eval run --model hf --model_args pretrained=gpt2 --tasks hellaswag
lm-eval run --model openai --tasks mmlu --num_fewshot 5
""", # Evaluate on multiple tasks with few-shot examples
$ lm-eval run --model vllm --model_args pretrained=EleutherAI/gpt-j-6B --tasks arc_easy,arc_challenge --num_fewshot 5
# Evaluation with custom generation parameters
$ lm-eval run --model hf --model_args pretrained=gpt2 --tasks lambada --gen_kwargs "temperature=0.8,top_p=0.95"
# Use configuration file
$ lm-eval run --config my_config.yaml --tasks mmlu
For more information, see: https://github.com/EleutherAI/lm-evaluation-harness
"""),
formatter_class=argparse.RawDescriptionHelpFormatter, formatter_class=argparse.RawDescriptionHelpFormatter,
) )
self._add_args()
self._parser.set_defaults(func=lambda args: self._parser.print_help())
# Add command-specific arguments def _add_args(self) -> None:
self._add_args(parser) self._parser = self._parser
self._parser.add_argument(
# Set the function to execute for this subcommand
parser.set_defaults(func=self.execute)
def _add_args(self, parser: argparse.ArgumentParser) -> None:
parser.add_argument(
"--config", "--config",
"-C", "-C",
default=None, default=None,
...@@ -46,14 +54,14 @@ Examples: ...@@ -46,14 +54,14 @@ Examples:
metavar="DIR/file.yaml", metavar="DIR/file.yaml",
help="Path to config with all arguments for `lm-eval`", help="Path to config with all arguments for `lm-eval`",
) )
parser.add_argument( self._parser.add_argument(
"--model", "--model",
"-m", "-m",
type=str, type=str,
default="hf", default="hf",
help="Name of model. Default 'hf'", help="Name of model. Default 'hf'",
) )
parser.add_argument( self._parser.add_argument(
"--tasks", "--tasks",
"-t", "-t",
default=None, default=None,
...@@ -61,14 +69,14 @@ Examples: ...@@ -61,14 +69,14 @@ Examples:
metavar="task1,task2", metavar="task1,task2",
help="Comma-separated list of task names or task groupings to evaluate on.\nTo get full list of tasks, use one of the commands `lm-eval --tasks {{list_groups,list_subtasks,list_tags,list}}` to list out all available names for task groupings; only (sub)tasks; tags; or all of the above", help="Comma-separated list of task names or task groupings to evaluate on.\nTo get full list of tasks, use one of the commands `lm-eval --tasks {{list_groups,list_subtasks,list_tags,list}}` to list out all available names for task groupings; only (sub)tasks; tags; or all of the above",
) )
parser.add_argument( self._parser.add_argument(
"--model_args", "--model_args",
"-a", "-a",
default=None, default=None,
type=try_parse_json, type=try_parse_json,
help="""Comma separated string or JSON formatted arguments for model, e.g. `pretrained=EleutherAI/pythia-160m,dtype=float32` or '{"pretrained":"EleutherAI/pythia-160m","dtype":"float32"}'.""", help="""Comma separated string or JSON formatted arguments for model, e.g. `pretrained=EleutherAI/pythia-160m,dtype=float32` or '{"pretrained":"EleutherAI/pythia-160m","dtype":"float32"}'.""",
) )
parser.add_argument( self._parser.add_argument(
"--num_fewshot", "--num_fewshot",
"-f", "-f",
type=int, type=int,
...@@ -76,7 +84,7 @@ Examples: ...@@ -76,7 +84,7 @@ Examples:
metavar="N", metavar="N",
help="Number of examples in few-shot context", help="Number of examples in few-shot context",
) )
parser.add_argument( self._parser.add_argument(
"--batch_size", "--batch_size",
"-b", "-b",
type=str, type=str,
...@@ -84,20 +92,20 @@ Examples: ...@@ -84,20 +92,20 @@ Examples:
metavar="auto|auto:N|N", metavar="auto|auto:N|N",
help="Acceptable values are 'auto', 'auto:N' (recompute batchsize N times with time) or N, where N is an integer. Default 1.", help="Acceptable values are 'auto', 'auto:N' (recompute batchsize N times with time) or N, where N is an integer. Default 1.",
) )
parser.add_argument( self._parser.add_argument(
"--max_batch_size", "--max_batch_size",
type=int, type=int,
default=None, default=None,
metavar="N", metavar="N",
help="Maximal batch size to try with --batch_size auto.", help="Maximal batch size to try with --batch_size auto.",
) )
parser.add_argument( self._parser.add_argument(
"--device", "--device",
type=str, type=str,
default=None, default=None,
help="Device to use (e.g. cuda, cuda:0, cpu). Model defaults. Default None.", help="Device to use (e.g. cuda, cuda:0, cpu). Model defaults. Default None.",
) )
parser.add_argument( self._parser.add_argument(
"--output_path", "--output_path",
"-o", "-o",
default=None, default=None,
...@@ -105,7 +113,7 @@ Examples: ...@@ -105,7 +113,7 @@ Examples:
metavar="DIR|DIR/file.json", metavar="DIR|DIR/file.json",
help="Path where result metrics will be saved. Can be either a directory or a .json file. If the path is a directory and log_samples is true, the results will be saved in the directory. Else the parent directory will be used.", help="Path where result metrics will be saved. Can be either a directory or a .json file. If the path is a directory and log_samples is true, the results will be saved in the directory. Else the parent directory will be used.",
) )
parser.add_argument( self._parser.add_argument(
"--limit", "--limit",
"-L", "-L",
type=float, type=float,
...@@ -114,7 +122,7 @@ Examples: ...@@ -114,7 +122,7 @@ Examples:
help="Limit the number of examples per task. " help="Limit the number of examples per task. "
"If <1, limit is a percentage of the total number of examples.", "If <1, limit is a percentage of the total number of examples.",
) )
parser.add_argument( self._parser.add_argument(
"--samples", "--samples",
"-E", "-E",
default=None, default=None,
...@@ -122,7 +130,7 @@ Examples: ...@@ -122,7 +130,7 @@ Examples:
metavar="/path/to/json", metavar="/path/to/json",
help='JSON string or path to JSON file containing doc indices of selected examples to test. Format: {"task_name":[indices],...}', help='JSON string or path to JSON file containing doc indices of selected examples to test. Format: {"task_name":[indices],...}',
) )
parser.add_argument( self._parser.add_argument(
"--use_cache", "--use_cache",
"-c", "-c",
type=str, type=str,
...@@ -130,40 +138,40 @@ Examples: ...@@ -130,40 +138,40 @@ Examples:
metavar="DIR", metavar="DIR",
help="A path to a sqlite db file for caching model responses. `None` if not caching.", help="A path to a sqlite db file for caching model responses. `None` if not caching.",
) )
parser.add_argument( self._parser.add_argument(
"--cache_requests", "--cache_requests",
type=request_caching_arg_to_dict, type=request_caching_arg_to_dict,
default=None, default=None,
choices=["true", "refresh", "delete"], choices=["true", "refresh", "delete"],
help="Speed up evaluation by caching the building of dataset requests. `None` if not caching.", help="Speed up evaluation by caching the building of dataset requests. `None` if not caching.",
) )
parser.add_argument( self._parser.add_argument(
"--check_integrity", "--check_integrity",
action="store_true", action="store_true",
default=argparse.SUPPRESS, default=argparse.SUPPRESS,
help="Whether to run the relevant part of the test suite for the tasks.", help="Whether to run the relevant part of the test suite for the tasks.",
) )
parser.add_argument( self._parser.add_argument(
"--write_out", "--write_out",
"-w", "-w",
action="store_true", action="store_true",
default=argparse.SUPPRESS, default=argparse.SUPPRESS,
help="Prints the prompt for the first few documents.", help="Prints the prompt for the first few documents.",
) )
parser.add_argument( self._parser.add_argument(
"--log_samples", "--log_samples",
"-s", "-s",
action="store_true", action="store_true",
default=argparse.SUPPRESS, default=argparse.SUPPRESS,
help="If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis. Use with --output_path.", help="If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis. Use with --output_path.",
) )
parser.add_argument( self._parser.add_argument(
"--system_instruction", "--system_instruction",
type=str, type=str,
default=None, default=None,
help="System instruction to be used in the prompt", help="System instruction to be used in the prompt",
) )
parser.add_argument( self._parser.add_argument(
"--apply_chat_template", "--apply_chat_template",
type=str, type=str,
nargs="?", nargs="?",
...@@ -176,26 +184,26 @@ Examples: ...@@ -176,26 +184,26 @@ Examples:
"E.g. `--apply_chat_template template_name`" "E.g. `--apply_chat_template template_name`"
), ),
) )
parser.add_argument( self._parser.add_argument(
"--fewshot_as_multiturn", "--fewshot_as_multiturn",
action="store_true", action="store_true",
default=argparse.SUPPRESS, default=argparse.SUPPRESS,
help="If True, uses the fewshot as a multi-turn conversation", help="If True, uses the fewshot as a multi-turn conversation",
) )
parser.add_argument( self._parser.add_argument(
"--show_config", "--show_config",
action="store_true", action="store_true",
default=argparse.SUPPRESS, default=argparse.SUPPRESS,
help="If True, shows the the full config of all tasks at the end of the evaluation.", help="If True, shows the the full config of all tasks at the end of the evaluation.",
) )
parser.add_argument( self._parser.add_argument(
"--include_path", "--include_path",
type=str, type=str,
default=None, default=None,
metavar="DIR", metavar="DIR",
help="Additional path to include if there are external tasks to include.", help="Additional path to include if there are external tasks to include.",
) )
parser.add_argument( self._parser.add_argument(
"--gen_kwargs", "--gen_kwargs",
type=try_parse_json, type=try_parse_json,
default=None, default=None,
...@@ -204,7 +212,7 @@ Examples: ...@@ -204,7 +212,7 @@ Examples:
""" e.g. '{"do_sample": True, temperature":0.7,"until":["hello"]}' or temperature=0,top_p=0.1.""" """ e.g. '{"do_sample": True, temperature":0.7,"until":["hello"]}' or temperature=0,top_p=0.1."""
), ),
) )
parser.add_argument( self._parser.add_argument(
"--verbosity", "--verbosity",
"-v", "-v",
type=str.upper, type=str.upper,
...@@ -212,25 +220,25 @@ Examples: ...@@ -212,25 +220,25 @@ Examples:
metavar="CRITICAL|ERROR|WARNING|INFO|DEBUG", metavar="CRITICAL|ERROR|WARNING|INFO|DEBUG",
help="(Deprecated) Controls logging verbosity level. Use the `LOGLEVEL` environment variable instead. Set to DEBUG for detailed output when testing or adding new task configurations.", help="(Deprecated) Controls logging verbosity level. Use the `LOGLEVEL` environment variable instead. Set to DEBUG for detailed output when testing or adding new task configurations.",
) )
parser.add_argument( self._parser.add_argument(
"--wandb_args", "--wandb_args",
type=str, type=str,
default=argparse.SUPPRESS, default=argparse.SUPPRESS,
help="Comma separated string arguments passed to wandb.init, e.g. `project=lm-eval,job_type=eval`", help="Comma separated string arguments passed to wandb.init, e.g. `project=lm-eval,job_type=eval`",
) )
parser.add_argument( self._parser.add_argument(
"--wandb_config_args", "--wandb_config_args",
type=str, type=str,
default=argparse.SUPPRESS, default=argparse.SUPPRESS,
help="Comma separated string arguments passed to wandb.config.update. Use this to trace parameters that aren't already traced by default. eg. `lr=0.01,repeats=3`", help="Comma separated string arguments passed to wandb.config.update. Use this to trace parameters that aren't already traced by default. eg. `lr=0.01,repeats=3`",
) )
parser.add_argument( self._parser.add_argument(
"--hf_hub_log_args", "--hf_hub_log_args",
type=str, type=str,
default=argparse.SUPPRESS, default=argparse.SUPPRESS,
help="Comma separated string arguments passed to Hugging Face Hub's log function, e.g. `hub_results_org=EleutherAI,hub_repo_name=lm-eval-results`", help="Comma separated string arguments passed to Hugging Face Hub's log function, e.g. `hub_results_org=EleutherAI,hub_repo_name=lm-eval-results`",
) )
parser.add_argument( self._parser.add_argument(
"--predict_only", "--predict_only",
"-x", "-x",
action="store_true", action="store_true",
...@@ -238,7 +246,7 @@ Examples: ...@@ -238,7 +246,7 @@ Examples:
help="Use with --log_samples. Only model outputs will be saved and metrics will not be evaluated.", help="Use with --log_samples. Only model outputs will be saved and metrics will not be evaluated.",
) )
default_seed_string = "0,1234,1234,1234" default_seed_string = "0,1234,1234,1234"
parser.add_argument( self._parser.add_argument(
"--seed", "--seed",
type=partial(_int_or_none_list_arg_type, 3, 4, default_seed_string), type=partial(_int_or_none_list_arg_type, 3, 4, default_seed_string),
default=default_seed_string, # for backward compatibility default=default_seed_string, # for backward compatibility
...@@ -253,19 +261,19 @@ Examples: ...@@ -253,19 +261,19 @@ Examples:
"E.g, `--seed 42` sets all four seeds to 42." "E.g, `--seed 42` sets all four seeds to 42."
), ),
) )
parser.add_argument( self._parser.add_argument(
"--trust_remote_code", "--trust_remote_code",
action="store_true", action="store_true",
default=argparse.SUPPRESS, default=argparse.SUPPRESS,
help="Sets trust_remote_code to True to execute code to create HF Datasets from the Hub", help="Sets trust_remote_code to True to execute code to create HF Datasets from the Hub",
) )
parser.add_argument( self._parser.add_argument(
"--confirm_run_unsafe_code", "--confirm_run_unsafe_code",
action="store_true", action="store_true",
default=argparse.SUPPRESS, default=argparse.SUPPRESS,
help="Confirm that you understand the risks of running unsafe code for tasks that require it", help="Confirm that you understand the risks of running unsafe code for tasks that require it",
) )
parser.add_argument( self._parser.add_argument(
"--metadata", "--metadata",
type=json.loads, type=json.loads,
default=None, default=None,
......
import argparse import argparse
import sys import sys
import textwrap
from lm_eval._cli.base import SubCommand from lm_eval._cli.base import SubCommand
...@@ -8,29 +9,73 @@ class ValidateCommand(SubCommand): ...@@ -8,29 +9,73 @@ class ValidateCommand(SubCommand):
"""Command for validating tasks.""" """Command for validating tasks."""
def __init__(self, subparsers: argparse._SubParsersAction, *args, **kwargs): def __init__(self, subparsers: argparse._SubParsersAction, *args, **kwargs):
# Create and configure the parser # Create and configure the self._parser
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
parser = subparsers.add_parser( self._parser = subparsers.add_parser(
"validate", "validate",
help="Validate task configurations", help="Validate task configurations",
description="Validate task configurations and check for errors.", description="Validate task configurations and check for errors.",
epilog=""" epilog=textwrap.dedent("""
Examples: examples:
lm-eval validate --tasks hellaswag # Validate single task # Validate a single task
lm-eval validate --tasks arc_easy,arc_challenge # Validate multiple tasks lm-eval validate --tasks hellaswag
lm-eval validate --tasks mmlu --include_path ./custom_tasks
""", # Validate multiple tasks
lm-eval validate --tasks arc_easy,arc_challenge,hellaswag
# Validate a task group
lm-eval validate --tasks mmlu
# Validate tasks with external definitions
lm-eval validate --tasks my_custom_task --include_path ./custom_tasks
# Validate tasks from multiple external paths
lm-eval validate --tasks custom_task1,custom_task2 --include_path "/path/to/tasks1:/path/to/tasks2"
validation check:
The validate command performs several checks:
• Task existence: Verifies all specified tasks are available
• Configuration syntax: Checks YAML/JSON configuration files
• Dataset access: Validates dataset paths and configurations
• Required fields: Ensures all mandatory task parameters are present
• Metric definitions: Verifies metric functions and aggregation methods
• Filter pipelines: Validates filter chains and their parameters
• Template rendering: Tests prompt templates with sample data
task config files:
Tasks are defined using YAML configuration files with these key sections:
• task: Task name and metadata
• dataset_path: HuggingFace dataset identifier
• doc_to_text: Template for converting documents to prompts
• doc_to_target: Template for extracting target answers
• metric_list: List of evaluation metrics to compute
• output_type: Type of model output (loglikelihood, generate_until, etc.)
• filter_list: Post-processing filters for model outputs
common errors:
• Missing required fields in YAML configuration
• Invalid dataset paths or missing dataset splits
• Malformed Jinja2 templates in doc_to_text/doc_to_target
• Undefined metrics or aggregation functions
• Invalid filter names or parameters
• Circular dependencies in task inheritance
• Missing external task files when using --include_path
debugging tips:
• Use --include_path to test external task definitions
• Check task configuration files for syntax errors
• Verify dataset access and authentication if needed
• Use 'lm-eval list tasks' to see available tasks
For task configuration guide, see: https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/task_guide.md
"""),
formatter_class=argparse.RawDescriptionHelpFormatter, formatter_class=argparse.RawDescriptionHelpFormatter,
) )
self._add_args()
self._parser.set_defaults(func=lambda args: self._parser.print_help())
# Add command-specific arguments def _add_args(self) -> None:
self._add_args(parser) self._parser.add_argument(
# Set the function to execute for this subcommand
parser.set_defaults(func=self.execute)
def _add_args(self, parser: argparse.ArgumentParser) -> None:
parser.add_argument(
"--tasks", "--tasks",
"-t", "-t",
required=True, required=True,
...@@ -38,7 +83,7 @@ Examples: ...@@ -38,7 +83,7 @@ Examples:
metavar="task1,task2", metavar="task1,task2",
help="Comma-separated list of task names to validate", help="Comma-separated list of task names to validate",
) )
parser.add_argument( self._parser.add_argument(
"--include_path", "--include_path",
type=str, type=str,
default=None, default=None,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment