help="""Comma separated string or JSON formatted arguments for model, e.g. `pretrained=EleutherAI/pythia-160m,dtype=float32` or '{"pretrained":"EleutherAI/pythia-160m","dtype":"float32"}'.""",
)
...
...
@@ -77,9 +80,9 @@ Examples:
"--batch_size",
"-b",
type=str,
default=1,
default=argparse.SUPPRESS,
metavar="auto|auto:N|N",
help="Acceptable values are 'auto', 'auto:N' or N, where N is an integer. Default 1.",
help="Acceptable values are 'auto', 'auto:N' (recompute batchsize N times with time) or N, where N is an integer. Default 1.",
)
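# Note: with default=argparse.SUPPRESS, a flag that is not passed on the command line
# never appears in the parsed Namespace, so the effective default is decided downstream
# (e.g. by the evaluation config) rather than by argparse itself.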
parser.add_argument(
"--max_batch_size",
...
...
@@ -92,7 +95,7 @@ Examples:
"--device",
type=str,
default=None,
help="Device to use (e.g. cuda, cuda:0, cpu).",
help="Device to use (e.g. cuda, cuda:0, cpu). Model defaults. Default None.",
)
parser.add_argument(
"--output_path",
...
...
@@ -115,7 +118,7 @@ Examples:
"--samples",
"-E",
default=None,
type=str,
type=try_parse_json,
metavar="/path/to/json",
help='JSON string or path to JSON file containing doc indices of selected examples to test. Format: {"task_name":[indices],...}',
)
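# try_parse_json is expected to accept either an inline JSON string or a path to a JSON
# file (per the help text above) and return the parsed object; exact behavior depends on
# that helper's implementation.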
...
...
@@ -129,7 +132,7 @@ Examples:
)
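# request_caching_arg_to_dict is assumed to map the string choice ("true", "refresh",
# "delete") to the corresponding request-caching flags consumed by the evaluator.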
parser.add_argument(
"--cache_requests",
type=str,
type=request_caching_arg_to_dict,
default=None,
choices=["true","refresh","delete"],
help="Speed up evaluation by caching the building of dataset requests. `None` if not caching.",
...
...
@@ -137,20 +140,21 @@ Examples:
parser.add_argument(
"--check_integrity",
action="store_true",
default=argparse.SUPPRESS,
help="Whether to run the relevant part of the test suite for the tasks.",
)
parser.add_argument(
"--write_out",
"-w",
action="store_true",
default=False,
default=argparse.SUPPRESS,
help="Prints the prompt for the first few documents.",
)
parser.add_argument(
"--log_samples",
"-s",
action="store_true",
default=False,
default=argparse.SUPPRESS,
help="If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis. Use with --output_path.",
)
parser.add_argument(
...
...
@@ -164,7 +168,7 @@ Examples:
type=str,
nargs="?",
const=True,
default=False,
default=argparse.SUPPRESS,
help=(
"If True, apply chat template to the prompt. "
"Providing `--apply_chat_template` without an argument will apply the default chat template to the prompt. "
...
...
@@ -175,13 +179,13 @@ Examples:
parser.add_argument(
"--fewshot_as_multiturn",
action="store_true",
default=False,
default=argparse.SUPPRESS,
help="If True, uses the fewshot as a multi-turn conversation",
)
parser.add_argument(
"--show_config",
action="store_true",
default=False,
default=argparse.SUPPRESS,
help="If True, shows the the full config of all tasks at the end of the evaluation.",
)
parser.add_argument(
...
...
@@ -197,7 +201,7 @@ Examples:
default=None,
help=(
"Either comma delimited string or JSON formatted arguments for model generation on greedy_until tasks,"
""" e.g. '{"temperature":0.7,"until":["hello"]}' or temperature=0,top_p=0.1."""
""" e.g. '{"do_sample": True, temperature":0.7,"until":["hello"]}' or temperature=0,top_p=0.1."""
),
)
parser.add_argument(
...
...
@@ -211,26 +215,26 @@ Examples:
parser.add_argument(
"--wandb_args",
type=str,
default="",
default=argparse.SUPPRESS,
help="Comma separated string arguments passed to wandb.init, e.g. `project=lm-eval,job_type=eval`",
)
parser.add_argument(
"--wandb_config_args",
type=str,
default="",
default=argparse.SUPPRESS,
help="Comma separated string arguments passed to wandb.config.update. Use this to trace parameters that aren't already traced by default. eg. `lr=0.01,repeats=3`",
)
parser.add_argument(
"--hf_hub_log_args",
type=str,
default="",
default=argparse.SUPPRESS,
help="Comma separated string arguments passed to Hugging Face Hub's log function, e.g. `hub_results_org=EleutherAI,hub_repo_name=lm-eval-results`",
)
parser.add_argument(
"--predict_only",
"-x",
action="store_true",
default=False,
default=argparse.SUPPRESS,
help="Use with --log_samples. Only model outputs will be saved and metrics will not be evaluated.",
)
default_seed_string = "0,1234,1234,1234"
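# Used for --seed: the four comma-separated values seed python's random, numpy, torch,
# and fewshot sampling, respectively.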
...
...
@@ -252,11 +256,13 @@ Examples:
parser.add_argument(
"--trust_remote_code",
action="store_true",
default=argparse.SUPPRESS,
help="Sets trust_remote_code to True to execute code to create HF Datasets from the Hub",
)
parser.add_argument(
"--confirm_run_unsafe_code",
action="store_true",
default=argparse.SUPPRESS,
help="Confirm that you understand the risks of running unsafe code for tasks that require it",
)
parser.add_argument(
...
...
@@ -268,16 +274,13 @@ Examples:
def execute(self, args: argparse.Namespace) -> None:
"""Execute the evaluation command."""
# Import here to avoid circular imports and for faster CLI loading
from lm_eval.api.eval_config import EvaluationConfig
# Create and validate config (validation now happens in EvaluationConfig)