Commit 91e49e23 authored by Baber

fix: update default values and improve help text in configuration files

parent 84d02f77
@@ -24,7 +24,7 @@ class Run(SubCommand):
             "run",
             help="Run the evaluation harness on specified tasks",
             description="Evaluate language models on various benchmarks and tasks.",
-            usage="lm-eval run --model <model> --tasks <task1,task2,...> [options]",
+            usage="lm-eval run --model <model> --tasks <task> <task> --model_args <arg=value> <arg=value> [options]",
             epilog=textwrap.dedent("""
                 examples:
                   # Basic evaluation with HuggingFace model
@@ -34,7 +34,7 @@ class Run(SubCommand):
                   $ lm-eval run --model vllm --model_args pretrained=EleutherAI/gpt-j-6B --tasks arc_easy arc_challenge --num_fewshot 5
                   # Evaluation with custom generation parameters
-                  $ lm-eval run --model hf --model_args pretrained=gpt2 --tasks lambada --gen_kwargs temperature=0.8 top_p=0.95
+                  $ lm-eval run --model hf --model_args pretrained=gpt2 --tasks lambada --gen_kwargs temperature=0.8 top_p=0.95 'stop=["\\n\\n"]'
                   # Use configuration file
                   $ lm-eval run --config my_config.yaml --tasks mmlu
@@ -133,7 +133,8 @@ class Run(SubCommand):
             nargs="*",
             metavar="KWARGS",
             help=textwrap.dedent(
-                'Generation arguments as `temperature=0,stop=["stop"]` or `key=val` `key2=val2`. Values should be parsable with ast.literal_eval.'
+                'Generation arguments as `temperature=0,stop=["stop"]` or `key=val` `key2=val2`.'
+                " Values should be parsable with ast.literal_eval."
             ),
         )
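Note: the help text above promises that values are parsed with ast.literal_eval. A minimal sketch of what that implies for `key=val` pairs (illustration only; parse_kwargs is a hypothetical helper, not the harness's actual parser):

import ast

def parse_kwargs(pairs):
    # Hypothetical helper: split each "key=val" pair and literal_eval the value.
    out = {}
    for pair in pairs:
        key, _, raw = pair.partition("=")
        try:
            out[key] = ast.literal_eval(raw)  # "0.8" -> 0.8, '["x"]' -> ["x"]
        except (ValueError, SyntaxError):
            out[key] = raw  # leave unparsable values as plain strings
    return out

print(parse_kwargs(["temperature=0.8", "top_p=0.95", 'stop=["\\n\\n"]']))
# {'temperature': 0.8, 'top_p': 0.95, 'stop': ['\n\n']}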
@@ -167,9 +168,10 @@ class Run(SubCommand):
             "-E",
             default=None,
             type=try_parse_json,
-            metavar="JSON_FILE",
+            metavar='"task1": [1,2,3,4,...]',
             help=textwrap.dedent(
-                'JSON file with specific sample indices for inputs: {"task_name":[indices],...}. Incompatible with --limit.'
+                "`...` `...` Sample indices for inputs. Incompatible with --limit."
+                " Values should be parsable with ast.literal_eval."
             ),
         )
@@ -314,9 +316,10 @@ class Run(SubCommand):
             "--metadata",
             type=json.loads,
             default=None,
-            metavar="JSON",
+            metavar="`key=val` `key2=val2`",
             help=textwrap.dedent(
-                """JSON metadata for task configs (merged with model_args), required for some tasks such as RULER"""
+                """`key=val` `key2=val2` args parsable by ast.literal_eval (merged with model_args),
+                required for some tasks such as RULER"""
             ),
         )
...
@@ -21,6 +21,7 @@ DICT_KEYS = [
     "hf_hub_log_args",
     "metadata",
     "model_args",
+    "gen_kwargs",
 ]
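Adding "gen_kwargs" to DICT_KEYS suggests it now gets the same treatment as model_args and metadata when a YAML config and CLI flags both supply it. A rough sketch of that idea, assuming DICT_KEYS marks dict-valued keys that are merged per field rather than replaced wholesale (merge_config is illustrative, not the harness's real merge logic):

DICT_KEYS = ["hf_hub_log_args", "metadata", "model_args", "gen_kwargs"]

def merge_config(yaml_cfg, cli_cfg):
    merged = {**yaml_cfg, **cli_cfg}  # CLI wins outright for scalar keys
    for key in DICT_KEYS:
        # Assumed behavior: dict-valued keys merge field by field.
        if isinstance(yaml_cfg.get(key), dict) and isinstance(cli_cfg.get(key), dict):
            merged[key] = {**yaml_cfg[key], **cli_cfg[key]}
    return merged

print(merge_config({"gen_kwargs": {"temperature": 0.7}}, {"gen_kwargs": {"top_p": 0.95}}))
# {'gen_kwargs': {'temperature': 0.7, 'top_p': 0.95}}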
@@ -79,7 +80,7 @@ class EvaluatorConfig:
     # Device
     device: Optional[str] = field(
-        default=None, metadata={"help": "Device to use (e.g. cuda, cuda:0, cpu)"}
+        default="cuda:0", metadata={"help": "Device to use (e.g. cuda, cuda:0, cpu)"}
     )
     # Data sampling and limiting
@@ -126,7 +127,10 @@ class EvaluatorConfig:
         default=None, metadata={"help": "Custom System instruction to add"}
     )
     apply_chat_template: Union[bool, str] = field(
-        default=False, metadata={"help": "Apply chat template to prompt"}
+        default=False,
+        metadata={
+            "help": "Apply chat template to prompt. Either True, or a string identifying the tokenizer template."
+        },
     )
     fewshot_as_multiturn: bool = field(
         default=False,
@@ -170,7 +174,7 @@ class EvaluatorConfig:
         metadata={"help": "Seeds for random, numpy, torch, fewshot (random)"},
     )
-    # Security and safety
+    # Security
     trust_remote_code: bool = field(
         default=False, metadata={"help": "Trust remote code for HF datasets"}
     )
@@ -201,7 +205,7 @@ class EvaluatorConfig:
             config.update(cls.load_yaml_config(namespace.config))
         # Override with CLI args (only truthy values, exclude non-config args)
-        excluded_args = {"config", "command", "func"}  # argparse internal args
+        excluded_args = {"command", "func"}  # argparse internal args
         cli_args = {
             k: v for k, v in vars(namespace).items() if v and k not in excluded_args
         }
@@ -252,7 +256,6 @@ class EvaluatorConfig:
         try:
             yaml_data = yaml.safe_load(config_file.read_text())
-            print(textwrap.dedent(f"""yaml: {yaml_data}"""))
         except yaml.YAMLError as e:
             raise ValueError(f"Invalid YAML in {config_path}: {e}")
         except (OSError, UnicodeDecodeError) as e:
@@ -337,17 +340,10 @@ class EvaluatorConfig:
             metadata=self.metadata if self.metadata else {},
         )
-        # self.tasks is a comma-separated string of task names
-        if isinstance((task_list := self.tasks), str):
-            task_list = self.tasks.split(",")
-        else:
-            assert isinstance(self.tasks, list), (
-                "`tasks` must be a comma delimited string of task names or list[str]."
-            )
-        task_names = task_manager.match_tasks(task_list)
+        task_names = task_manager.match_tasks(self.tasks)
         # Check for any individual task files in the list
-        for task in [task for task in task_list if task not in task_names]:
+        for task in [task for task in self.tasks if task not in task_names]:
             task_path = Path(task)
             if task_path.is_file():
                 config = utils.load_yaml_config(str(task_path))
@@ -355,7 +351,7 @@ class EvaluatorConfig:
         # Check for missing tasks
         task_missing = [
-            task for task in task_list if task not in task_names and "*" not in task
+            task for task in self.tasks if task not in task_names and "*" not in task
         ]
         if task_missing:
...
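The two hunks above drop the comma-splitting shim: self.tasks is now passed to task_manager.match_tasks as a list, and patterns containing "*" are excluded from the missing-task check because match_tasks resolves them. A toy illustration of that wildcard matching, assuming fnmatch-style glob semantics (the registry contents here are invented):

from fnmatch import fnmatch

registry = ["arc_easy", "arc_challenge", "hellaswag"]  # stand-in for the real task index

def match_tasks(patterns):
    # Expand each pattern against the registry; exact names match themselves.
    return sorted({name for p in patterns for name in registry if fnmatch(name, p)})

print(match_tasks(["arc_*", "hellaswag"]))
# ['arc_challenge', 'arc_easy', 'hellaswag']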
@@ -467,7 +467,9 @@ def evaluate(
             "Either 'limit' or 'samples' must be None, but both are not None."
         )
     if samples is not None:
-        eval_logger.info(f"Evaluating examples for tasks {list(samples.keys())}")
+        eval_logger.info(
+            f"Evaluating examples for tasks {[x for x in list(samples.keys()) if x in task_dict.keys()]}"
+        )
     if apply_chat_template:
         eval_logger.warning(
             "Chat template formatting change affects loglikelihood and multiple-choice tasks. See docs/chat-template-readme.md for details."
...
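The reworked log line reports only the sample keys that correspond to loaded tasks, rather than every key in the samples dict. A self-contained illustration of that filter (task names invented):

samples = {"hellaswag": [1, 2, 3], "not_loaded": [5]}
task_dict = {"hellaswag": object(), "arc_easy": object()}

# Same comprehension as in the diff: drop sample keys with no loaded task.
print([x for x in list(samples.keys()) if x in task_dict.keys()])
# ['hellaswag']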
@@ -6,9 +6,10 @@
 # Usage:
 #   $ lm_eval --config templates/example_ci_config.yaml
 #
-# You can override any values in this config with command-line arguments:
+# You can override any values in this config with further command-line arguments:
 #   $ lm_eval --config templates/example_ci_config.yaml --model_args pretrained=gpt2 --tasks mmlu
 #
+# For expected types and values, refer to EvaluatorConfig in lm_eval/config/evaluate_config.py
 # All parameters are optional and have the same meaning as their CLI counterparts.
 model: hf
@@ -19,13 +20,13 @@ tasks:
   - hellaswag
   - arc_easy
 batch_size: 1
-device: mps
 trust_remote_code: true
 log_samples: true
 output_path: ./test
 gen_kwargs:
   do_sample: true
   temperature: 0.7
+  stop: ["\n", "<|endoftext|>"]
 samples:
   hellaswag: [1,2,3,4,5,6,7,8,9,10]
   arc_easy: [10,20,30,40,50,60,70,80,90,100]
...
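A quick way to sanity-check this config before a run is to parse it the same way the harness does (yaml.safe_load, as in the loader hunk above) and inspect the result:

import yaml

with open("templates/example_ci_config.yaml") as f:
    cfg = yaml.safe_load(f)

print(cfg["tasks"])       # ['hellaswag', 'arc_easy']
print(cfg["gen_kwargs"])  # {'do_sample': True, 'temperature': 0.7, 'stop': ['\n', '<|endoftext|>']}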