Commit 9de93651 authored by Baber's avatar Baber
Browse files

cleanup

parent febdcc5b
...@@ -3,7 +3,6 @@ CLI subcommands for the Language Model Evaluation Harness. ...@@ -3,7 +3,6 @@ CLI subcommands for the Language Model Evaluation Harness.
""" """
from lm_eval._cli.base import SubCommand from lm_eval._cli.base import SubCommand
from lm_eval._cli.cache import Cache
from lm_eval._cli.cli import CLIParser from lm_eval._cli.cli import CLIParser
from lm_eval._cli.list import ListCommand from lm_eval._cli.list import ListCommand
from lm_eval._cli.run import Run from lm_eval._cli.run import Run
...@@ -15,6 +14,5 @@ __all__ = [ ...@@ -15,6 +14,5 @@ __all__ = [
"Run", "Run",
"ListCommand", "ListCommand",
"ValidateCommand", "ValidateCommand",
"Cache",
"CLIParser", "CLIParser",
] ]
...@@ -14,7 +14,7 @@ class SubCommand(ABC): ...@@ -14,7 +14,7 @@ class SubCommand(ABC):
return cls(subparsers) return cls(subparsers)
@abstractmethod @abstractmethod
def _add_args(self, parser: argparse.ArgumentParser) -> None: def _add_args(self) -> None:
"""Add arguments specific to this subcommand.""" """Add arguments specific to this subcommand."""
pass pass
......
import argparse
from lm_eval._cli.base import SubCommand
class Cache(SubCommand):
    """Command for cache management."""

    def __init__(self, subparsers: argparse._SubParsersAction, *args, **kwargs):
        """Register the ``cache`` subcommand on the given subparsers action.

        Builds the argparse parser for cache management, attaches the
        command-specific arguments, and wires ``execute`` as the handler.
        """
        super().__init__(*args, **kwargs)
        # Build and register the 'cache' subcommand parser.
        cache_parser = subparsers.add_parser(
            "cache",
            help="Manage evaluation cache",
            description="Manage evaluation cache files and directories.",
            epilog="""
Examples:
  lm-eval cache clear --cache_path ./cache.db # Clear cache file
  lm-eval cache info --cache_path ./cache.db # Show cache info
  lm-eval cache clear --cache_path ./cache_dir/ # Clear cache directory
""",
            # Raw formatter keeps the epilog's example layout intact.
            formatter_class=argparse.RawDescriptionHelpFormatter,
        )
        # Attach cache-specific arguments, then bind the handler so that
        # 'lm-eval cache ...' dispatches to self.execute.
        self._add_args(cache_parser)
        cache_parser.set_defaults(func=self.execute)

    def _add_args(self, parser: argparse.ArgumentParser) -> None:
        """Add the positional action and the cache-path option to *parser*."""
        parser.add_argument(
            "action",
            choices=["clear", "info"],
            help="Action to perform: clear or info",
        )
        parser.add_argument(
            "--cache_path",
            type=str,
            default=None,
            help="Path to cache directory or file",
        )

    def execute(self, args: argparse.Namespace) -> None:
        """Execute the cache command."""
        # Intentionally unimplemented stub: cache handling is not wired up yet.
        raise NotImplementedError
import argparse import argparse
import sys import sys
import textwrap
from lm_eval._cli.cache import Cache
from lm_eval._cli.run import Run
from lm_eval._cli.list import ListCommand from lm_eval._cli.list import ListCommand
from lm_eval._cli.run import Run
from lm_eval._cli.validate import ValidateCommand from lm_eval._cli.validate import ValidateCommand
...@@ -14,7 +14,31 @@ class CLIParser: ...@@ -14,7 +14,31 @@ class CLIParser:
self._parser = argparse.ArgumentParser( self._parser = argparse.ArgumentParser(
prog="lm-eval", prog="lm-eval",
description="Language Model Evaluation Harness", description="Language Model Evaluation Harness",
formatter_class=argparse.RawTextHelpFormatter, epilog=textwrap.dedent("""
quick start:
# Basic evaluation
lm-eval run --model hf --model_args pretrained=gpt2 --tasks hellaswag
# List available tasks
lm-eval list tasks
# Validate task configurations
lm-eval validate --tasks hellaswag,arc_easy
available commands:
run Run the harness on specified tasks
list List available tasks, groups, subtasks, or tags
validate Validate task configurations and check for errors
legacy compatibility:
The harness maintains backward compatibility with the original interface.
If no command is specified, 'run' is automatically inserted:
lm-eval --model hf --tasks hellaswag # Equivalent to 'lm-eval run --model hf --tasks hellaswag'
For documentation, visit: https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md
"""),
formatter_class=argparse.RawDescriptionHelpFormatter,
) )
self._parser.set_defaults(func=lambda args: self._parser.print_help()) self._parser.set_defaults(func=lambda args: self._parser.print_help())
self._subparsers = self._parser.add_subparsers( self._subparsers = self._parser.add_subparsers(
...@@ -23,7 +47,6 @@ class CLIParser: ...@@ -23,7 +47,6 @@ class CLIParser:
Run.create(self._subparsers) Run.create(self._subparsers)
ListCommand.create(self._subparsers) ListCommand.create(self._subparsers)
ValidateCommand.create(self._subparsers) ValidateCommand.create(self._subparsers)
Cache.create(self._subparsers)
def parse_args(self) -> argparse.Namespace: def parse_args(self) -> argparse.Namespace:
"""Parse arguments using the main parser.""" """Parse arguments using the main parser."""
......
import argparse import argparse
import textwrap
from lm_eval._cli.base import SubCommand from lm_eval._cli.base import SubCommand
...@@ -9,30 +10,51 @@ class ListCommand(SubCommand): ...@@ -9,30 +10,51 @@ class ListCommand(SubCommand):
def __init__(self, subparsers: argparse._SubParsersAction, *args, **kwargs): def __init__(self, subparsers: argparse._SubParsersAction, *args, **kwargs):
# Create and configure the parser # Create and configure the parser
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
parser = subparsers.add_parser( self._parser = subparsers.add_parser(
"list", "list",
help="List available tasks, groups, subtasks, or tags", help="List available tasks, groups, subtasks, or tags",
description="List available tasks, groups, subtasks, or tags from the evaluation harness.", description="List available tasks, groups, subtasks, or tags from the evaluation harness.",
epilog=""" epilog=textwrap.dedent("""
Examples: examples:
lm-eval list tasks # List all available tasks # List all available tasks (includes groups, subtasks, and tags)
lm-eval list groups # List task groups only $ lm-eval list tasks
lm-eval list subtasks # List subtasks only
lm-eval list tags # List available tags # List only task groups (like 'mmlu', 'glue', 'superglue')
lm-eval list tasks --include_path /path/to/external/tasks $ lm-eval list groups
""",
# List only individual subtasks (like 'mmlu_abstract_algebra')
$ lm-eval list subtasks
# Include external task definitions
$ lm-eval list tasks --include_path /path/to/external/tasks
# List tasks from multiple external paths
$ lm-eval list tasks --include_path "/path/to/tasks1:/path/to/tasks2"
organization:
• Groups: Collections of tasks with aggregated metric across subtasks (e.g., 'mmlu')
• Subtasks: Individual evaluation tasks (e.g., 'mmlu_anatomy', 'hellaswag')
• Tags: Similar to groups but no aggregate metric (e.g., 'reasoning', 'knowledge', 'language')
• External Tasks: Custom tasks defined in external directories
evaluation usage:
After listing tasks, use them with the run command!
For more information tasks configs are defined in https://github.com/EleutherAI/lm-evaluation-harness/tree/main/lm_eval/tasks
"""),
formatter_class=argparse.RawDescriptionHelpFormatter, formatter_class=argparse.RawDescriptionHelpFormatter,
) )
self._add_args(parser) self._add_args()
parser.set_defaults(func=self.execute) self._parser.set_defaults(func=lambda arg: self._parser.print_help())
def _add_args(self, parser: argparse.ArgumentParser) -> None: def _add_args(self) -> None:
parser.add_argument( self._parser.add_argument(
"what", "what",
choices=["tasks", "groups", "subtasks", "tags"], choices=["tasks", "groups", "subtasks", "tags"],
nargs="?",
help="What to list: tasks (all), groups, subtasks, or tags", help="What to list: tasks (all), groups, subtasks, or tags",
) )
parser.add_argument( self._parser.add_argument(
"--include_path", "--include_path",
type=str, type=str,
default=None, default=None,
...@@ -54,3 +76,5 @@ Examples: ...@@ -54,3 +76,5 @@ Examples:
print(task_manager.list_all_tasks(list_groups=False, list_tags=False)) print(task_manager.list_all_tasks(list_groups=False, list_tags=False))
elif args.what == "tags": elif args.what == "tags":
print(task_manager.list_all_tasks(list_groups=False, list_subtasks=False)) print(task_manager.list_all_tasks(list_groups=False, list_subtasks=False))
elif args.what is None:
self._parser.print_help()
...@@ -2,6 +2,7 @@ import argparse ...@@ -2,6 +2,7 @@ import argparse
import json import json
import logging import logging
import os import os
import textwrap
from functools import partial from functools import partial
from lm_eval._cli import SubCommand from lm_eval._cli import SubCommand
...@@ -18,27 +19,34 @@ class Run(SubCommand): ...@@ -18,27 +19,34 @@ class Run(SubCommand):
def __init__(self, subparsers: argparse._SubParsersAction, *args, **kwargs): def __init__(self, subparsers: argparse._SubParsersAction, *args, **kwargs):
# Create and configure the parser # Create and configure the parser
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
parser = subparsers.add_parser( self._parser = subparsers.add_parser(
"run", "run",
help="Run language model evaluation", help="Run language model evaluation",
description="Evaluate language models on various benchmarks and tasks.", description="Evaluate language models on various benchmarks and tasks.",
epilog=""" epilog=textwrap.dedent("""
Examples: examples:
lm-eval run --model hf --model_args pretrained=gpt2 --tasks hellaswag # Basic evaluation with HuggingFace model
lm-eval run --config my_config.yaml --tasks arc_easy,arc_challenge $ lm-eval run --model hf --model_args pretrained=gpt2 --tasks hellaswag
lm-eval run --model openai --tasks mmlu --num_fewshot 5
""", # Evaluate on multiple tasks with few-shot examples
$ lm-eval run --model vllm --model_args pretrained=EleutherAI/gpt-j-6B --tasks arc_easy,arc_challenge --num_fewshot 5
# Evaluation with custom generation parameters
$ lm-eval run --model hf --model_args pretrained=gpt2 --tasks lambada --gen_kwargs "temperature=0.8,top_p=0.95"
# Use configuration file
$ lm-eval run --config my_config.yaml --tasks mmlu
For more information, see: https://github.com/EleutherAI/lm-evaluation-harness
"""),
formatter_class=argparse.RawDescriptionHelpFormatter, formatter_class=argparse.RawDescriptionHelpFormatter,
) )
self._add_args()
self._parser.set_defaults(func=lambda args: self._parser.print_help())
# Add command-specific arguments def _add_args(self) -> None:
self._add_args(parser) self._parser = self._parser
self._parser.add_argument(
# Set the function to execute for this subcommand
parser.set_defaults(func=self.execute)
def _add_args(self, parser: argparse.ArgumentParser) -> None:
parser.add_argument(
"--config", "--config",
"-C", "-C",
default=None, default=None,
...@@ -46,14 +54,14 @@ Examples: ...@@ -46,14 +54,14 @@ Examples:
metavar="DIR/file.yaml", metavar="DIR/file.yaml",
help="Path to config with all arguments for `lm-eval`", help="Path to config with all arguments for `lm-eval`",
) )
parser.add_argument( self._parser.add_argument(
"--model", "--model",
"-m", "-m",
type=str, type=str,
default="hf", default="hf",
help="Name of model. Default 'hf'", help="Name of model. Default 'hf'",
) )
parser.add_argument( self._parser.add_argument(
"--tasks", "--tasks",
"-t", "-t",
default=None, default=None,
...@@ -61,14 +69,14 @@ Examples: ...@@ -61,14 +69,14 @@ Examples:
metavar="task1,task2", metavar="task1,task2",
help="Comma-separated list of task names or task groupings to evaluate on.\nTo get full list of tasks, use one of the commands `lm-eval --tasks {{list_groups,list_subtasks,list_tags,list}}` to list out all available names for task groupings; only (sub)tasks; tags; or all of the above", help="Comma-separated list of task names or task groupings to evaluate on.\nTo get full list of tasks, use one of the commands `lm-eval --tasks {{list_groups,list_subtasks,list_tags,list}}` to list out all available names for task groupings; only (sub)tasks; tags; or all of the above",
) )
parser.add_argument( self._parser.add_argument(
"--model_args", "--model_args",
"-a", "-a",
default=None, default=None,
type=try_parse_json, type=try_parse_json,
help="""Comma separated string or JSON formatted arguments for model, e.g. `pretrained=EleutherAI/pythia-160m,dtype=float32` or '{"pretrained":"EleutherAI/pythia-160m","dtype":"float32"}'.""", help="""Comma separated string or JSON formatted arguments for model, e.g. `pretrained=EleutherAI/pythia-160m,dtype=float32` or '{"pretrained":"EleutherAI/pythia-160m","dtype":"float32"}'.""",
) )
parser.add_argument( self._parser.add_argument(
"--num_fewshot", "--num_fewshot",
"-f", "-f",
type=int, type=int,
...@@ -76,7 +84,7 @@ Examples: ...@@ -76,7 +84,7 @@ Examples:
metavar="N", metavar="N",
help="Number of examples in few-shot context", help="Number of examples in few-shot context",
) )
parser.add_argument( self._parser.add_argument(
"--batch_size", "--batch_size",
"-b", "-b",
type=str, type=str,
...@@ -84,20 +92,20 @@ Examples: ...@@ -84,20 +92,20 @@ Examples:
metavar="auto|auto:N|N", metavar="auto|auto:N|N",
help="Acceptable values are 'auto', 'auto:N' (recompute batchsize N times with time) or N, where N is an integer. Default 1.", help="Acceptable values are 'auto', 'auto:N' (recompute batchsize N times with time) or N, where N is an integer. Default 1.",
) )
parser.add_argument( self._parser.add_argument(
"--max_batch_size", "--max_batch_size",
type=int, type=int,
default=None, default=None,
metavar="N", metavar="N",
help="Maximal batch size to try with --batch_size auto.", help="Maximal batch size to try with --batch_size auto.",
) )
parser.add_argument( self._parser.add_argument(
"--device", "--device",
type=str, type=str,
default=None, default=None,
help="Device to use (e.g. cuda, cuda:0, cpu). Model defaults. Default None.", help="Device to use (e.g. cuda, cuda:0, cpu). Model defaults. Default None.",
) )
parser.add_argument( self._parser.add_argument(
"--output_path", "--output_path",
"-o", "-o",
default=None, default=None,
...@@ -105,7 +113,7 @@ Examples: ...@@ -105,7 +113,7 @@ Examples:
metavar="DIR|DIR/file.json", metavar="DIR|DIR/file.json",
help="Path where result metrics will be saved. Can be either a directory or a .json file. If the path is a directory and log_samples is true, the results will be saved in the directory. Else the parent directory will be used.", help="Path where result metrics will be saved. Can be either a directory or a .json file. If the path is a directory and log_samples is true, the results will be saved in the directory. Else the parent directory will be used.",
) )
parser.add_argument( self._parser.add_argument(
"--limit", "--limit",
"-L", "-L",
type=float, type=float,
...@@ -114,7 +122,7 @@ Examples: ...@@ -114,7 +122,7 @@ Examples:
help="Limit the number of examples per task. " help="Limit the number of examples per task. "
"If <1, limit is a percentage of the total number of examples.", "If <1, limit is a percentage of the total number of examples.",
) )
parser.add_argument( self._parser.add_argument(
"--samples", "--samples",
"-E", "-E",
default=None, default=None,
...@@ -122,7 +130,7 @@ Examples: ...@@ -122,7 +130,7 @@ Examples:
metavar="/path/to/json", metavar="/path/to/json",
help='JSON string or path to JSON file containing doc indices of selected examples to test. Format: {"task_name":[indices],...}', help='JSON string or path to JSON file containing doc indices of selected examples to test. Format: {"task_name":[indices],...}',
) )
parser.add_argument( self._parser.add_argument(
"--use_cache", "--use_cache",
"-c", "-c",
type=str, type=str,
...@@ -130,40 +138,40 @@ Examples: ...@@ -130,40 +138,40 @@ Examples:
metavar="DIR", metavar="DIR",
help="A path to a sqlite db file for caching model responses. `None` if not caching.", help="A path to a sqlite db file for caching model responses. `None` if not caching.",
) )
parser.add_argument( self._parser.add_argument(
"--cache_requests", "--cache_requests",
type=request_caching_arg_to_dict, type=request_caching_arg_to_dict,
default=None, default=None,
choices=["true", "refresh", "delete"], choices=["true", "refresh", "delete"],
help="Speed up evaluation by caching the building of dataset requests. `None` if not caching.", help="Speed up evaluation by caching the building of dataset requests. `None` if not caching.",
) )
parser.add_argument( self._parser.add_argument(
"--check_integrity", "--check_integrity",
action="store_true", action="store_true",
default=argparse.SUPPRESS, default=argparse.SUPPRESS,
help="Whether to run the relevant part of the test suite for the tasks.", help="Whether to run the relevant part of the test suite for the tasks.",
) )
parser.add_argument( self._parser.add_argument(
"--write_out", "--write_out",
"-w", "-w",
action="store_true", action="store_true",
default=argparse.SUPPRESS, default=argparse.SUPPRESS,
help="Prints the prompt for the first few documents.", help="Prints the prompt for the first few documents.",
) )
parser.add_argument( self._parser.add_argument(
"--log_samples", "--log_samples",
"-s", "-s",
action="store_true", action="store_true",
default=argparse.SUPPRESS, default=argparse.SUPPRESS,
help="If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis. Use with --output_path.", help="If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis. Use with --output_path.",
) )
parser.add_argument( self._parser.add_argument(
"--system_instruction", "--system_instruction",
type=str, type=str,
default=None, default=None,
help="System instruction to be used in the prompt", help="System instruction to be used in the prompt",
) )
parser.add_argument( self._parser.add_argument(
"--apply_chat_template", "--apply_chat_template",
type=str, type=str,
nargs="?", nargs="?",
...@@ -176,26 +184,26 @@ Examples: ...@@ -176,26 +184,26 @@ Examples:
"E.g. `--apply_chat_template template_name`" "E.g. `--apply_chat_template template_name`"
), ),
) )
parser.add_argument( self._parser.add_argument(
"--fewshot_as_multiturn", "--fewshot_as_multiturn",
action="store_true", action="store_true",
default=argparse.SUPPRESS, default=argparse.SUPPRESS,
help="If True, uses the fewshot as a multi-turn conversation", help="If True, uses the fewshot as a multi-turn conversation",
) )
parser.add_argument( self._parser.add_argument(
"--show_config", "--show_config",
action="store_true", action="store_true",
default=argparse.SUPPRESS, default=argparse.SUPPRESS,
help="If True, shows the the full config of all tasks at the end of the evaluation.", help="If True, shows the the full config of all tasks at the end of the evaluation.",
) )
parser.add_argument( self._parser.add_argument(
"--include_path", "--include_path",
type=str, type=str,
default=None, default=None,
metavar="DIR", metavar="DIR",
help="Additional path to include if there are external tasks to include.", help="Additional path to include if there are external tasks to include.",
) )
parser.add_argument( self._parser.add_argument(
"--gen_kwargs", "--gen_kwargs",
type=try_parse_json, type=try_parse_json,
default=None, default=None,
...@@ -204,7 +212,7 @@ Examples: ...@@ -204,7 +212,7 @@ Examples:
""" e.g. '{"do_sample": True, temperature":0.7,"until":["hello"]}' or temperature=0,top_p=0.1.""" """ e.g. '{"do_sample": True, temperature":0.7,"until":["hello"]}' or temperature=0,top_p=0.1."""
), ),
) )
parser.add_argument( self._parser.add_argument(
"--verbosity", "--verbosity",
"-v", "-v",
type=str.upper, type=str.upper,
...@@ -212,25 +220,25 @@ Examples: ...@@ -212,25 +220,25 @@ Examples:
metavar="CRITICAL|ERROR|WARNING|INFO|DEBUG", metavar="CRITICAL|ERROR|WARNING|INFO|DEBUG",
help="(Deprecated) Controls logging verbosity level. Use the `LOGLEVEL` environment variable instead. Set to DEBUG for detailed output when testing or adding new task configurations.", help="(Deprecated) Controls logging verbosity level. Use the `LOGLEVEL` environment variable instead. Set to DEBUG for detailed output when testing or adding new task configurations.",
) )
parser.add_argument( self._parser.add_argument(
"--wandb_args", "--wandb_args",
type=str, type=str,
default=argparse.SUPPRESS, default=argparse.SUPPRESS,
help="Comma separated string arguments passed to wandb.init, e.g. `project=lm-eval,job_type=eval`", help="Comma separated string arguments passed to wandb.init, e.g. `project=lm-eval,job_type=eval`",
) )
parser.add_argument( self._parser.add_argument(
"--wandb_config_args", "--wandb_config_args",
type=str, type=str,
default=argparse.SUPPRESS, default=argparse.SUPPRESS,
help="Comma separated string arguments passed to wandb.config.update. Use this to trace parameters that aren't already traced by default. eg. `lr=0.01,repeats=3`", help="Comma separated string arguments passed to wandb.config.update. Use this to trace parameters that aren't already traced by default. eg. `lr=0.01,repeats=3`",
) )
parser.add_argument( self._parser.add_argument(
"--hf_hub_log_args", "--hf_hub_log_args",
type=str, type=str,
default=argparse.SUPPRESS, default=argparse.SUPPRESS,
help="Comma separated string arguments passed to Hugging Face Hub's log function, e.g. `hub_results_org=EleutherAI,hub_repo_name=lm-eval-results`", help="Comma separated string arguments passed to Hugging Face Hub's log function, e.g. `hub_results_org=EleutherAI,hub_repo_name=lm-eval-results`",
) )
parser.add_argument( self._parser.add_argument(
"--predict_only", "--predict_only",
"-x", "-x",
action="store_true", action="store_true",
...@@ -238,7 +246,7 @@ Examples: ...@@ -238,7 +246,7 @@ Examples:
help="Use with --log_samples. Only model outputs will be saved and metrics will not be evaluated.", help="Use with --log_samples. Only model outputs will be saved and metrics will not be evaluated.",
) )
default_seed_string = "0,1234,1234,1234" default_seed_string = "0,1234,1234,1234"
parser.add_argument( self._parser.add_argument(
"--seed", "--seed",
type=partial(_int_or_none_list_arg_type, 3, 4, default_seed_string), type=partial(_int_or_none_list_arg_type, 3, 4, default_seed_string),
default=default_seed_string, # for backward compatibility default=default_seed_string, # for backward compatibility
...@@ -253,19 +261,19 @@ Examples: ...@@ -253,19 +261,19 @@ Examples:
"E.g, `--seed 42` sets all four seeds to 42." "E.g, `--seed 42` sets all four seeds to 42."
), ),
) )
parser.add_argument( self._parser.add_argument(
"--trust_remote_code", "--trust_remote_code",
action="store_true", action="store_true",
default=argparse.SUPPRESS, default=argparse.SUPPRESS,
help="Sets trust_remote_code to True to execute code to create HF Datasets from the Hub", help="Sets trust_remote_code to True to execute code to create HF Datasets from the Hub",
) )
parser.add_argument( self._parser.add_argument(
"--confirm_run_unsafe_code", "--confirm_run_unsafe_code",
action="store_true", action="store_true",
default=argparse.SUPPRESS, default=argparse.SUPPRESS,
help="Confirm that you understand the risks of running unsafe code for tasks that require it", help="Confirm that you understand the risks of running unsafe code for tasks that require it",
) )
parser.add_argument( self._parser.add_argument(
"--metadata", "--metadata",
type=json.loads, type=json.loads,
default=None, default=None,
......
import argparse import argparse
import sys import sys
import textwrap
from lm_eval._cli.base import SubCommand from lm_eval._cli.base import SubCommand
...@@ -8,29 +9,73 @@ class ValidateCommand(SubCommand): ...@@ -8,29 +9,73 @@ class ValidateCommand(SubCommand):
"""Command for validating tasks.""" """Command for validating tasks."""
def __init__(self, subparsers: argparse._SubParsersAction, *args, **kwargs): def __init__(self, subparsers: argparse._SubParsersAction, *args, **kwargs):
# Create and configure the parser # Create and configure the self._parser
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
parser = subparsers.add_parser( self._parser = subparsers.add_parser(
"validate", "validate",
help="Validate task configurations", help="Validate task configurations",
description="Validate task configurations and check for errors.", description="Validate task configurations and check for errors.",
epilog=""" epilog=textwrap.dedent("""
Examples: examples:
lm-eval validate --tasks hellaswag # Validate single task # Validate a single task
lm-eval validate --tasks arc_easy,arc_challenge # Validate multiple tasks lm-eval validate --tasks hellaswag
lm-eval validate --tasks mmlu --include_path ./custom_tasks
""", # Validate multiple tasks
lm-eval validate --tasks arc_easy,arc_challenge,hellaswag
# Validate a task group
lm-eval validate --tasks mmlu
# Validate tasks with external definitions
lm-eval validate --tasks my_custom_task --include_path ./custom_tasks
# Validate tasks from multiple external paths
lm-eval validate --tasks custom_task1,custom_task2 --include_path "/path/to/tasks1:/path/to/tasks2"
validation check:
The validate command performs several checks:
• Task existence: Verifies all specified tasks are available
• Configuration syntax: Checks YAML/JSON configuration files
• Dataset access: Validates dataset paths and configurations
• Required fields: Ensures all mandatory task parameters are present
• Metric definitions: Verifies metric functions and aggregation methods
• Filter pipelines: Validates filter chains and their parameters
• Template rendering: Tests prompt templates with sample data
task config files:
Tasks are defined using YAML configuration files with these key sections:
• task: Task name and metadata
• dataset_path: HuggingFace dataset identifier
• doc_to_text: Template for converting documents to prompts
• doc_to_target: Template for extracting target answers
• metric_list: List of evaluation metrics to compute
• output_type: Type of model output (loglikelihood, generate_until, etc.)
• filter_list: Post-processing filters for model outputs
common errors:
• Missing required fields in YAML configuration
• Invalid dataset paths or missing dataset splits
• Malformed Jinja2 templates in doc_to_text/doc_to_target
• Undefined metrics or aggregation functions
• Invalid filter names or parameters
• Circular dependencies in task inheritance
• Missing external task files when using --include_path
debugging tips:
• Use --include_path to test external task definitions
• Check task configuration files for syntax errors
• Verify dataset access and authentication if needed
• Use 'lm-eval list tasks' to see available tasks
For task configuration guide, see: https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/task_guide.md
"""),
formatter_class=argparse.RawDescriptionHelpFormatter, formatter_class=argparse.RawDescriptionHelpFormatter,
) )
self._add_args()
self._parser.set_defaults(func=lambda args: self._parser.print_help())
# Add command-specific arguments def _add_args(self) -> None:
self._add_args(parser) self._parser.add_argument(
# Set the function to execute for this subcommand
parser.set_defaults(func=self.execute)
def _add_args(self, parser: argparse.ArgumentParser) -> None:
parser.add_argument(
"--tasks", "--tasks",
"-t", "-t",
required=True, required=True,
...@@ -38,7 +83,7 @@ Examples: ...@@ -38,7 +83,7 @@ Examples:
metavar="task1,task2", metavar="task1,task2",
help="Comma-separated list of task names to validate", help="Comma-separated list of task names to validate",
) )
parser.add_argument( self._parser.add_argument(
"--include_path", "--include_path",
type=str, type=str,
default=None, default=None,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment