help="Path to config with all arguments for `lm-eval`",
help="Set initial arguments from YAML config",
)
)
# Model and Tasks
model_group = self._parser.add_argument_group("model and tasks")
model_group.add_argument(
    "--model",
    "-m",
    type=str,
    default="hf",
    metavar="MODEL_NAME",
    help="Model name (default: hf)",
)
model_group.add_argument(
    "--tasks",
    "-t",
    default=None,
    type=str,
    metavar="TASK1,TASK2",
    help=textwrap.dedent("""
        Comma-separated list of task names or groupings.
        Use 'lm-eval list tasks' to see all available tasks.
        """).strip(),
)
model_group.add_argument(
    "--model_args",
    "-a",
    default=None,
    type=try_parse_json,  # accepts 'k=v,k2=v2' or a JSON object string
    metavar="ARGS",
    help="Model arguments as 'key=val,key2=val2' or JSON string",
)

# Evaluation Settings
eval_group = self._parser.add_argument_group("evaluation settings")
# NOTE(review): the flag-name/type/default lines of this option were lost in
# the merge; "--batch_size"/"-b" with type=str and default 1 matches the
# surviving help text ("auto", "auto:N", or integer; default 1) — confirm
# against version control.
eval_group.add_argument(
    "--batch_size",
    "-b",
    type=str,
    default=1,
    metavar="auto|auto:N|N",
    help=textwrap.dedent(
        "Batch size: 'auto', 'auto:N' (auto-tune N times), or integer (default: 1)"
    ),
)
eval_group.add_argument(
    "--max_batch_size",
    type=int,
    default=None,
    metavar="N",
    help="Maximum batch size when using --batch_size auto",
)
eval_group.add_argument(
    "--device",
    type=str,
    default=None,
    metavar="DEVICE",
    help="Device to use (e.g. cuda, cuda:0, cpu, mps)",
)
eval_group.add_argument(
    "--gen_kwargs",
    type=try_parse_json,  # accepts 'k=v,k2=v2' or a JSON object string
    default=None,
    metavar="KWARGS",
    help="Generation arguments as 'key=val,key2=val2' or JSON string",
)
# Data and Output
data_group = self._parser.add_argument_group("data and output")
data_group.add_argument(
    "--output_path",
    "-o",
    default=None,
    type=str,
    metavar="OUTPUT_PATH",
    help="Output dir or json file for results (and samples)",
)
data_group.add_argument(
    "--log_samples",
    "-s",
    action="store_true",
    # SUPPRESS keeps the attribute off the namespace unless the flag is given,
    # so a YAML config value is not clobbered by an argparse default.
    default=argparse.SUPPRESS,
    help="Save all model outputs and documents for post-hoc analysis",
)
data_group.add_argument(
    "--limit",
    "-L",
    type=float,  # float so a fraction (0 < N < 1) can select a percentage
    default=None,
    metavar="N|0.0-1.0",
    help="Limit examples per task (integer count or fraction)",
)
data_group.add_argument(
    "--samples",
    "-E",
    default=None,
    type=try_parse_json,  # JSON string or path to a JSON file
    metavar="JSON_FILE",
    help=textwrap.dedent(
        'JSON file with specific sample indices for inputs: {"task_name":[indices],...}. Incompatible with --limit.'
    ),
)
# Caching and Performance
cache_group = self._parser.add_argument_group("caching and performance")
cache_group.add_argument(
    "--use_cache",
    "-c",
    type=str,
    default=None,  # None disables response caching
    metavar="CACHE_DIR",
    help="SQLite database path for caching model outputs.",
)
cache_group.add_argument(
    "--cache_requests",
    type=request_caching_arg_to_dict,
    default=None,
    choices=["true", "refresh", "delete"],
    help="Cache dataset request building (true|refresh|delete)",
)
cache_group.add_argument(
    "--check_integrity",
    action="store_true",
    # SUPPRESS: attribute absent unless the flag is passed (config-friendly)
    default=argparse.SUPPRESS,
    help="Run task test suite validation",
)
# --write_out and --log_samples are registered with the logging and data
# argument groups at their other occurrences in this method; only
# --include_path belongs at this position.
self._parser.add_argument(
    "--include_path",
    type=str,
    default=None,
    metavar="TASK_DIR",
    help="Additional directory for external tasks",
)
# Logging and Tracking
# (--gen_kwargs is registered with the evaluation-settings group earlier in
# this method; the stray duplicate that was merged into this span is dropped.)
logging_group = self._parser.add_argument_group("logging and tracking")
logging_group.add_argument(
    "--verbosity",
    "-v",
    type=str.upper,  # normalize e.g. 'debug' -> 'DEBUG'
    default=None,
    metavar="LEVEL",
    help="(Deprecated) Log level. Use LOGLEVEL env var instead",
)
logging_group.add_argument(
    "--write_out",
    "-w",
    action="store_true",
    # SUPPRESS: attribute absent unless the flag is passed (config-friendly)
    default=argparse.SUPPRESS,
    help="Print prompts for first few documents",
)
logging_group.add_argument(
    "--show_config",
    action="store_true",
    default=argparse.SUPPRESS,
    help="Display full task configuration after evaluation",
)
logging_group.add_argument(
    "--wandb_args",
    type=str,
    default=argparse.SUPPRESS,
    help="Comma separated string arguments passed to wandb.init, e.g. `project=lm-eval,job_type=eval`",
)
# NOTE(review): the merge collapsed two wandb options into one flag; the
# second surviving help string describes wandb.config.update, which is a
# separate --wandb_config_args option upstream — confirm against version
# control.
logging_group.add_argument(
    "--wandb_config_args",
    type=str,
    default=argparse.SUPPRESS,
    help="Comma separated string arguments passed to wandb.config.update. Use this to trace parameters that aren't already traced by default. eg. `lr=0.01,repeats=3`",
)

# Advanced
advanced_group = self._parser.add_argument_group("advanced")
# NOTE(review): the flag-name line and any 'type=' converter for the seed
# option were lost in the merge — restore the converter from version control.
advanced_group.add_argument(
    "--seed",
    default=default_seed_string,  # for backward compatibility
    metavar="SEED|S1,S2,S3,S4",
    help=textwrap.dedent(f"""
        Random seeds for python,numpy,torch,fewshot (default: {default_seed_string}).
        Use single integer for all, or comma-separated list of 4 values.
        Use 'None' to skip setting a seed. Example: --seed 42 or --seed 0,None,8,52
        """).strip(),
)
advanced_group.add_argument(
    "--trust_remote_code",
    action="store_true",
    # SUPPRESS: attribute absent unless the flag is passed (config-friendly)
    default=argparse.SUPPRESS,
    help="Allow executing remote code from Hugging Face Hub",
)
advanced_group.add_argument(
    "--confirm_run_unsafe_code",
    action="store_true",
    default=argparse.SUPPRESS,
    help="Confirm understanding of unsafe code execution risks",
)
self._parser.add_argument(
advanced_group.add_argument(
"--metadata",
"--metadata",
type=json.loads,
type=json.loads,
default=None,
default=None,
help="""JSON string metadata to pass to task configs, for example '{"max_seq_lengths":[4096,8192]}'. Will be merged with model_args. Can also be set in task config.""",
metavar="JSON",
help=textwrap.dedent(
"JSON metadata for task configs (merged with model_args), required for some tasks such as RULER"