Commit 91e49e23 authored by Baber
Browse files

fix: update default values and improve help text in configuration files

parent 84d02f77
......@@ -24,7 +24,7 @@ class Run(SubCommand):
"run",
help="Run the evaluation harness on specified tasks",
description="Evaluate language models on various benchmarks and tasks.",
usage="lm-eval run --model <model> --tasks <task1,task2,...> [options]",
usage="lm-eval run --model <model> --tasks <task> <task> --model_args <arg=value> <arg=value> [options]",
epilog=textwrap.dedent("""
examples:
# Basic evaluation with HuggingFace model
......@@ -34,7 +34,7 @@ class Run(SubCommand):
$ lm-eval run --model vllm --model_args pretrained=EleutherAI/gpt-j-6B --tasks arc_easy arc_challenge --num_fewshot 5
# Evaluation with custom generation parameters
$ lm-eval run --model hf --model_args pretrained=gpt2 --tasks lambada --gen_kwargs temperature=0.8 top_p=0.95
$ lm-eval run --model hf --model_args pretrained=gpt2 --tasks lambada --gen_kwargs temperature=0.8 top_p=0.95 'stop=["\\n\\n"]'
# Use configuration file
$ lm-eval run --config my_config.yaml --tasks mmlu
......@@ -133,7 +133,8 @@ class Run(SubCommand):
nargs="*",
metavar="KWARGS",
help=textwrap.dedent(
'Generation arguments as `temperature=0,stop=["stop"]` or `key=val` `key2=val2`. Values should be parsable with ast.literal_eval.'
'Generation arguments as `temperature=0,stop=["stop"]` or `key=val` `key2=val2`.'
"Values should be parsable with ast.literal_eval."
),
)
......@@ -167,9 +168,10 @@ class Run(SubCommand):
"-E",
default=None,
type=try_parse_json,
metavar="JSON_FILE",
metavar='"task1": [1,2,3,4,...]"',
help=textwrap.dedent(
'JSON file with specific sample indices for inputs: {"task_name":[indices],...}. Incompatible with --limit.'
"`...` `...` Sample indices for inputs. Incompatible with --limit."
" Values be parsable with ast.literal_eval."
),
)
......@@ -314,9 +316,10 @@ class Run(SubCommand):
"--metadata",
type=json.loads,
default=None,
metavar="JSON",
metavar="`key=val` `key2=val2`",
help=textwrap.dedent(
"""JSON metadata for task configs (merged with model_args), required for some tasks such as RULER"""
"""`key=val` `key2=val` args parsable by ast.literal_eval (merged with model_args),
required for some tasks such as RULER"""
),
)
......
......@@ -21,6 +21,7 @@ DICT_KEYS = [
"hf_hub_log_args",
"metadata",
"model_args",
"gen_kwargs",
]
......@@ -79,7 +80,7 @@ class EvaluatorConfig:
# Device
device: Optional[str] = field(
default=None, metadata={"help": "Device to use (e.g. cuda, cuda:0, cpu)"}
default="cuda:0", metadata={"help": "Device to use (e.g. cuda, cuda:0, cpu)"}
)
# Data sampling and limiting
......@@ -126,7 +127,10 @@ class EvaluatorConfig:
default=None, metadata={"help": "Custom System instruction to add"}
)
apply_chat_template: Union[bool, str] = field(
default=False, metadata={"help": "Apply chat template to prompt"}
default=False,
metadata={
"help": "Apply chat template to prompt. Either True, or a string identifying the tokenizer template."
},
)
fewshot_as_multiturn: bool = field(
default=False,
......@@ -170,7 +174,7 @@ class EvaluatorConfig:
metadata={"help": "Seeds for random, numpy, torch, fewshot (random)"},
)
# Security and safety
# Security
trust_remote_code: bool = field(
default=False, metadata={"help": "Trust remote code for HF datasets"}
)
......@@ -201,7 +205,7 @@ class EvaluatorConfig:
config.update(cls.load_yaml_config(namespace.config))
# Override with CLI args (only truthy values, exclude non-config args)
excluded_args = {"config", "command", "func"} # argparse internal args
excluded_args = {"command", "func"} # argparse internal args
cli_args = {
k: v for k, v in vars(namespace).items() if v and k not in excluded_args
}
......@@ -252,7 +256,6 @@ class EvaluatorConfig:
try:
yaml_data = yaml.safe_load(config_file.read_text())
print(textwrap.dedent(f"""yaml: {yaml_data}"""))
except yaml.YAMLError as e:
raise ValueError(f"Invalid YAML in {config_path}: {e}")
except (OSError, UnicodeDecodeError) as e:
......@@ -337,17 +340,10 @@ class EvaluatorConfig:
metadata=self.metadata if self.metadata else {},
)
# self.tasks is a comma-separated string of task names
if isinstance((task_list := self.tasks), str):
task_list = self.tasks.split(",")
else:
assert isinstance(self.tasks, list), (
"`tasks` must be a comma delimited string of task names or list[str]."
)
task_names = task_manager.match_tasks(task_list)
task_names = task_manager.match_tasks(self.tasks)
# Check for any individual task files in the list
for task in [task for task in task_list if task not in task_names]:
for task in [task for task in self.tasks if task not in task_names]:
task_path = Path(task)
if task_path.is_file():
config = utils.load_yaml_config(str(task_path))
......@@ -355,7 +351,7 @@ class EvaluatorConfig:
# Check for missing tasks
task_missing = [
task for task in task_list if task not in task_names and "*" not in task
task for task in self.tasks if task not in task_names and "*" not in task
]
if task_missing:
......
......@@ -467,7 +467,9 @@ def evaluate(
"Either 'limit' or 'samples' must be None, but both are not None."
)
if samples is not None:
eval_logger.info(f"Evaluating examples for tasks {list(samples.keys())}")
eval_logger.info(
f"Evaluating examples for tasks {[x for x in list(samples.keys()) if x in task_dict.keys()]}"
)
if apply_chat_template:
eval_logger.warning(
"Chat template formatting change affects loglikelihood and multiple-choice tasks. See docs/chat-template-readme.md for details."
......
......@@ -6,9 +6,10 @@
# Usage:
# $ lm_eval --config templates/example_ci_config.yaml
#
# You can override any values in this config with command-line arguments:
# You can override any values in this config with further command-line arguments:
# $ lm_eval --config templates/example_ci_config.yaml --model_args pretrained=gpt2 --tasks mmlu
#
# For expected types and values, refer to EvaluatorConfig in lm_eval/config/evaluate_config.py
# All parameters are optional and have the same meaning as their CLI counterparts.
model: hf
......@@ -19,13 +20,13 @@ tasks:
- hellaswag
- arc_easy
batch_size: 1
device: mps
trust_remote_code: true
log_samples: true
output_path: ./test
gen_kwargs:
do_sample: true
temperature: 0.7
stop: ["\n", "<|endoftext|>"]
samples:
hellaswag: [1,2,3,4,5,6,7,8,9,10]
arc_easy: [10,20,30,40,50,60,70,80,90,100]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment