Commit b89af51e authored by Baber

update default values; fixes

parent fadd26e4
@@ -29,12 +29,11 @@ repos:
       - id: mixed-line-ending
         args: [--fix=lf]
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.12.2
+    rev: v0.12.5
     hooks:
       # Run the linter.
       - id: ruff-check
-        args: [ --fix]
-      # Run the formatter.
+        args: [--fix]
       - id: ruff-format
   - repo: https://github.com/codespell-project/codespell
     rev: v2.4.1
...
-from rich.traceback import install
-
 from lm_eval._cli.harness import HarnessCLI
 from lm_eval.utils import setup_logging
 
-install(show_locals=True)
-
 
 def cli_evaluate() -> None:
     """Main CLI entry point."""
     setup_logging()
...
@@ -8,6 +8,8 @@ from functools import partial
 from lm_eval._cli.subcommand import SubCommand
 from lm_eval._cli.utils import (
     _int_or_none_list_arg_type,
+    key_val_to_dict,
+    merge_dicts,
     request_caching_arg_to_dict,
     try_parse_json,
 )
@@ -22,17 +24,17 @@ class Run(SubCommand):
             "run",
             help="Run the evaluation harness on specified tasks",
             description="Evaluate language models on various benchmarks and tasks.",
-            usage="lm-eval run --model <model> --tasks <task1,task2,...> [options]",
+            usage="lm-eval run --model <model> --tasks <task> <task> --model_args <arg=value> <arg=value> [options]",
             epilog=textwrap.dedent("""
                 examples:
                   # Basic evaluation with HuggingFace model
-                  $ lm-eval run --model hf --model_args pretrained=gpt2 --tasks hellaswag
+                  $ lm-eval run --model hf --model_args pretrained=gpt2 dtype=float32 --tasks hellaswag
 
                   # Evaluate on multiple tasks with few-shot examples
-                  $ lm-eval run --model vllm --model_args pretrained=EleutherAI/gpt-j-6B --tasks arc_easy,arc_challenge --num_fewshot 5
+                  $ lm-eval run --model vllm --model_args pretrained=EleutherAI/gpt-j-6B --tasks arc_easy arc_challenge --num_fewshot 5
 
                   # Evaluation with custom generation parameters
-                  $ lm-eval run --model hf --model_args pretrained=gpt2 --tasks lambada --gen_kwargs "temperature=0.8,top_p=0.95"
+                  $ lm-eval run --model hf --model_args pretrained=gpt2 --tasks lambada --gen_kwargs temperature=0.8 top_p=0.95 'stop=["\\n\\n"]'
 
                   # Use configuration file
                   $ lm-eval run --config my_config.yaml --tasks mmlu
@@ -73,9 +75,10 @@ class Run(SubCommand):
             "-t",
             default=None,
             type=str,
-            metavar="TASK1,TASK2",
+            nargs="*",
+            metavar="TASK1 TASK2",
             help=textwrap.dedent("""
-                Comma-separated list of task names or groupings.
+                Space or Comma-separated list of task names or groupings.
                 Use 'lm-eval list tasks' to see all available tasks.
             """).strip(),
         )
@@ -83,9 +86,10 @@ class Run(SubCommand):
             "--model_args",
             "-a",
             default=None,
-            type=try_parse_json,
+            nargs="*",
+            type=key_val_to_dict,
             metavar="ARGS",
-            help="Model arguments as 'key=val,key2=val2' or JSON string",
+            help="Model arguments as 'key=val,key2=val2' or `key=val` `key2=val2`",
         )
 
         # Evaluation Settings
@@ -124,10 +128,14 @@ class Run(SubCommand):
         )
         eval_group.add_argument(
             "--gen_kwargs",
-            type=try_parse_json,
+            type=key_val_to_dict,
             default=None,
+            nargs="*",
             metavar="KWARGS",
-            help="Generation arguments as 'key=val,key2=val2' or JSON string",
+            help=textwrap.dedent(
+                'Generation arguments as `temperature=0,stop=["stop"]` or `key=val` `key2=val2`. '
+                "Values should be parsable with ast.literal_eval."
+            ),
         )
 
         # Data and Output
@@ -160,9 +168,10 @@ class Run(SubCommand):
             "-E",
             default=None,
             type=try_parse_json,
-            metavar="JSON_FILE",
+            metavar='"task1": [1,2,3,4,...]"',
             help=textwrap.dedent(
-                'JSON file with specific sample indices for inputs: {"task_name":[indices],...}. Incompatible with --limit.'
+                "`...` `...` Sample indices for inputs. Incompatible with --limit."
+                " Values should be parsable with ast.literal_eval."
             ),
         )
@@ -250,24 +259,24 @@ class Run(SubCommand):
         )
         logging_group.add_argument(
             "--wandb_args",
-            type=str,
+            type=key_val_to_dict,
             default=argparse.SUPPRESS,
             metavar="ARGS",
-            help="Weights & Biases init arguments (key=val,key2=val2)",
+            help="Weights & Biases init arguments key=val key2=val2",
         )
         logging_group.add_argument(
             "--wandb_config_args",
-            type=str,
+            type=key_val_to_dict,
             default=argparse.SUPPRESS,
             metavar="ARGS",
-            help="Weights & Biases config arguments (key=val,key2=val2)",
+            help="Weights & Biases config arguments key=val key2=val2",
         )
         logging_group.add_argument(
             "--hf_hub_log_args",
-            type=str,
+            type=key_val_to_dict,
             default=argparse.SUPPRESS,
             metavar="ARGS",
-            help="Hugging Face Hub logging arguments (key=val,key2=val2)",
+            help="Hugging Face Hub logging arguments key=val key2=val2",
         )
 
         # Advanced Options
@@ -307,15 +316,28 @@ class Run(SubCommand):
             "--metadata",
             type=json.loads,
             default=None,
-            metavar="JSON",
+            metavar="`key=val` `key2=val2`",
             help=textwrap.dedent(
-                """JSON metadata for task configs (merged with model_args), required for some tasks such as RULER"""
+                """`key=val` `key2=val` args parsable by ast.literal_eval (merged with model_args),
+                required for some tasks such as RULER"""
             ),
         )
 
-    def _execute(self, args: argparse.Namespace) -> None:
+    @staticmethod
+    def _execute(args: argparse.Namespace) -> None:
         """Runs the evaluation harness with the provided arguments."""
         os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+        MERGE_ARGS_DICTS = [
+            "model_args",
+            "gen_kwargs",
+            "wandb_args",
+            "wandb_config_args",
+            "hf_hub_log_args",
+        ]
+        for arg_name in MERGE_ARGS_DICTS:
+            if current_value := getattr(args, arg_name, None):
+                setattr(args, arg_name, merge_dicts(*current_value))
+
         from lm_eval.config.evaluate_config import EvaluatorConfig
 
         eval_logger = logging.getLogger(__name__)
...
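The new nargs="*" arguments above are parsed one token at a time by key_val_to_dict and then collapsed by merge_dicts in the MERGE_ARGS_DICTS loop. Neither helper's body appears in this diff; the following is only a minimal sketch of how such helpers could behave, assuming each token is a key=val pair whose value is tried with ast.literal_eval and otherwise kept as a raw string:

import ast


def key_val_to_dict(arg: str) -> dict:
    # Hypothetical sketch (not the repo's implementation): turn one
    # "key=val" CLI token into a single-entry dict.
    key, _, val = arg.partition("=")
    try:
        value = ast.literal_eval(val)
    except (ValueError, SyntaxError):
        value = val  # e.g. pretrained=gpt2 stays a plain string
    return {key: value}


def merge_dicts(*dicts: dict) -> dict:
    # Hypothetical sketch: fold the per-token dicts produced by nargs="*"
    # into one mapping; later tokens win on duplicate keys.
    merged: dict = {}
    for d in dicts:
        merged.update(d)
    return merged


# --gen_kwargs temperature=0.8 top_p=0.95 'stop=["\n\n"]' would yield
# [{'temperature': 0.8}, {'top_p': 0.95}, {'stop': ['\n\n']}], which the
# MERGE_ARGS_DICTS loop above collapses into a single dict.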
@@ -8,7 +8,6 @@ import re
 from collections.abc import Callable
 from copy import deepcopy
 from functools import cached_property
-from types import MethodType
 from typing import TYPE_CHECKING, Any, Literal, overload
 
 import datasets
@@ -523,8 +522,8 @@ class Task(abc.ABC):
         # self.aggregation = lambda: {
         #     metric_name: get_metric_aggregation(metric_name)
         # }
-        setattr(self._config, "metric_list", [MetricConfig(name=metric_name)])
-        setattr(self._config, "process_results", lambda *args: {"bypass": 0})
+        self._config.metric_list = [MetricConfig(name=metric_name)]
+        self._config.process_results = lambda *args: {"bypass": 0}
 
     def set_fewshot_seed(self, seed: int | None = None) -> None:
         self.fewshot_rnd = random.Random(seed)
@@ -656,6 +655,18 @@ class ConfigurableTask(Task):
         )
         self.task_docs = self.eval_docs
 
+        # for name, fn in self.config._fn.items():
+        #     if hasattr(self, name):
+        #         setattr(
+        #             self,
+        #             name,
+        #             types.MethodType(
+        #                 lambda self, *args, _fn=fn, **kwargs: _fn(*args, **kwargs),
+        #                 self,
+        #             ),
+        #         )
+        self.runtime_checks(self.task_docs[0])
+
     def download(
         self, dataset_kwargs:dict[str, Any] | None = None, **kwargs
@@ -968,6 +979,8 @@ class ConfigurableTask(Task):
         # if self.prompt is not None:
         #     doc_to_text = self.prompt
         doc_to_text = doc_to_text or self.config.doc_to_text
+        if callable(doc_to_text):
+            return doc_to_text(doc)
         if doc_to_text in doc:
             return doc[doc_to_text]
         elif isinstance(doc_to_text, str):
@@ -1013,6 +1026,8 @@ class ConfigurableTask(Task):
         # if self.prompt is not None:
         #     doc_to_target = self.prompt
         doc_to_target = doc_to_target or self.config.doc_to_target
+        if callable(doc_to_target):
+            return doc_to_target(doc)
         if doc_to_target in doc:
             return doc[doc_to_target]
         elif isinstance(doc_to_target, str):
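The two hunks above short-circuit doc_to_text and doc_to_target when the config supplies a callable, before falling back to a dataset column or a string spec. A toy sketch of that resolution order (illustrative only; the real methods also handle templates and integer specs):

def resolve(doc: dict, spec):
    # Order added by the hunks: callables first, then a column lookup,
    # then treat the spec as literal text (stand-in for template rendering).
    if callable(spec):
        return spec(doc)
    if spec in doc:
        return doc[spec]
    return spec


doc = {"question": "What is 2+2?", "answer": "4"}
print(resolve(doc, lambda d: "Q: " + d["question"]))  # Q: What is 2+2?
print(resolve(doc, "answer"))                         # 4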
@@ -1274,6 +1289,8 @@ class ConfigurableTask(Task):
         )
 
     def process_results(self, doc: dict, results: list) -> dict[str, Any]:
+        if callable(self.config.process_results):
+            return self.config.process_results(doc, results)
         result_dict = {}
         use_metric = list(m.metric_name for m in self.config._metric_list)
         if self.OUTPUT_TYPE == "loglikelihood":
@@ -1423,6 +1440,7 @@ class ConfigurableTask(Task):
         # Test One Doc
         self.features: list[str] = list(self.task_docs.features.keys())
         self.multiple_target = 0
+        self.multiple_input = 0
         test_text = self.doc_to_text(test_doc)
         test_target = self.doc_to_target(test_doc)
@@ -1430,13 +1448,19 @@ class ConfigurableTask(Task):
             test_choice = self.doc_to_choice(test_doc)
             if not isinstance(test_choice, list):
                 eval_logger.error("doc_to_choice must return list")
-            # else:
-            #     num_choice = len(test_choice)
+            else:
+                num_choice = len(test_choice)
+                if isinstance(test_text, int):
+                    eval_logger.debug(
+                        "doc_to_text returned an int. Assuming multiple inputs."
+                    )
 
             if isinstance(test_text, int):
                 eval_logger.debug(
                     "doc_to_text returned an int. Assuming multiple inputs."
                 )
+                self.multiple_input = num_choice
         else:
             test_choice = None
...
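The restored runtime check treats a doc_to_text that returns an int (while doc_to_choice returns a list) as a multiple-input task and records the number of choices in multiple_input. A toy illustration of just that condition, with made-up doc fields:

# Made-up doc and accessors, mirroring the condition checked above.
doc = {"label": 1, "options": ["yes", "no"]}


def doc_to_text(d):
    return d["label"]  # an int rather than a prompt string


def doc_to_choice(d):
    return d["options"]


test_text = doc_to_text(doc)
test_choice = doc_to_choice(doc)
multiple_input = len(test_choice) if isinstance(test_text, int) else 0
print(multiple_input)  # 2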
@@ -21,6 +21,7 @@ DICT_KEYS = [
     "hf_hub_log_args",
     "metadata",
     "model_args",
+    "gen_kwargs",
 ]
@@ -79,7 +80,7 @@ class EvaluatorConfig:
     # Device
     device: Optional[str] = field(
-        default=None, metadata={"help": "Device to use (e.g. cuda, cuda:0, cpu)"}
+        default="cuda:0", metadata={"help": "Device to use (e.g. cuda, cuda:0, cpu)"}
     )
 
     # Data sampling and limiting
@@ -126,7 +127,10 @@ class EvaluatorConfig:
         default=None, metadata={"help": "Custom System instruction to add"}
     )
     apply_chat_template: Union[bool, str] = field(
-        default=False, metadata={"help": "Apply chat template to prompt"}
+        default=False,
+        metadata={
+            "help": "Apply chat template to prompt. Either True, or a string identifying the tokenizer template."
+        },
     )
     fewshot_as_multiturn: bool = field(
         default=False,
@@ -170,7 +174,7 @@ class EvaluatorConfig:
         metadata={"help": "Seeds for random, numpy, torch, fewshot (random)"},
     )
 
-    # Security and safety
+    # Security
     trust_remote_code: bool = field(
         default=False, metadata={"help": "Trust remote code for HF datasets"}
     )
@@ -201,7 +205,7 @@ class EvaluatorConfig:
             config.update(cls.load_yaml_config(namespace.config))
 
         # Override with CLI args (only truthy values, exclude non-config args)
-        excluded_args = {"config", "command", "func"}  # argparse internal args
+        excluded_args = {"command", "func"}  # argparse internal args
         cli_args = {
             k: v for k, v in vars(namespace).items() if v and k not in excluded_args
         }
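The hunk above defines the precedence used when building the config: YAML values are loaded first, then any truthy CLI attributes override them, with argparse bookkeeping entries filtered out. A standalone sketch of that rule with hypothetical values (the real dict comes from EvaluatorConfig and the lm-eval parser):

import argparse

# Hypothetical starting point standing in for the values loaded from YAML.
config = {"model": "hf", "batch_size": 1, "device": "cuda:0"}
namespace = argparse.Namespace(command="run", func=None, model=None, batch_size=8)

excluded_args = {"command", "func"}  # argparse internal args
cli_args = {
    k: v for k, v in vars(namespace).items() if v and k not in excluded_args
}
config.update(cli_args)
print(config)  # {'model': 'hf', 'batch_size': 8, 'device': 'cuda:0'}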
@@ -252,7 +256,6 @@ class EvaluatorConfig:
         try:
             yaml_data = yaml.safe_load(config_file.read_text())
-            print(textwrap.dedent(f"""yaml: {yaml_data}"""))
         except yaml.YAMLError as e:
             raise ValueError(f"Invalid YAML in {config_path}: {e}")
         except (OSError, UnicodeDecodeError) as e:
@@ -337,17 +340,10 @@ class EvaluatorConfig:
             metadata=self.metadata if self.metadata else {},
         )
 
-        # self.tasks is a comma-separated string of task names
-        if isinstance((task_list := self.tasks), str):
-            task_list = self.tasks.split(",")
-        else:
-            assert isinstance(self.tasks, list), (
-                "`tasks` must be a comma delimited string of task names or list[str]."
-            )
-        task_names = task_manager.match_tasks(task_list)
+        task_names = task_manager.match_tasks(self.tasks)
 
         # Check for any individual task files in the list
-        for task in [task for task in task_list if task not in task_names]:
+        for task in [task for task in self.tasks if task not in task_names]:
             task_path = Path(task)
             if task_path.is_file():
                 config = utils.load_yaml_config(str(task_path))
@@ -355,7 +351,7 @@ class EvaluatorConfig:
 
         # Check for missing tasks
         task_missing = [
-            task for task in task_list if task not in task_names and "*" not in task
+            task for task in self.tasks if task not in task_names and "*" not in task
         ]
 
         if task_missing:
...
@@ -38,7 +38,7 @@ class MetricConfig:
             return is_higher_better(self.name)
         return self.higher_is_better
 
-    def compute_metric(self, *args, **kwargs) -> Any:
+    def compute(self, *args, **kwargs) -> Any:
         """Calculates the metric using the provided function and arguments."""
         if self.fn is None:
             raise ValueError(f"Metric function for {self.name} is not defined.")
...
@@ -10,7 +10,7 @@ import datasets
 from lm_eval.api.filter import FilterEnsemble
 from lm_eval.api.instance import OutputType
 from lm_eval.config.metric import MetricConfig
-from lm_eval.config.utils import doc_to_closure, maybe_serialize
+from lm_eval.config.utils import maybe_serialize
 
 if TYPE_CHECKING:
@@ -364,7 +364,7 @@ class TaskConfig:
     @classmethod
     def from_yaml(cls, data: dict[str, Any]) -> TaskConfig:
         """Create a TaskConfig instance from a YAML-like dictionary."""
-        fn = {k: doc_to_closure(v) for k, v in data.items() if callable(v)}
+        fn = {k: v for k, v in data.items() if callable(v)}
         return cls(**data, _fn=fn)
 
     @classmethod
...
@@ -475,7 +475,9 @@ def evaluate(
             "Either 'limit' or 'samples' must be None, but both are not None."
         )
     if samples is not None:
-        eval_logger.info(f"Evaluating examples for tasks {list(samples.keys())}")
+        eval_logger.info(
+            f"Evaluating examples for tasks {[x for x in list(samples.keys()) if x in task_dict.keys()]}"
+        )
     if apply_chat_template:
         eval_logger.warning(
             "Chat template formatting change affects loglikelihood and multiple-choice tasks. See docs/chat-template-readme.md for details."
...
@@ -11,10 +11,10 @@ authors = [
 description = "A framework for evaluating language models"
 readme = "README.md"
 classifiers = [
     "Development Status :: 3 - Alpha",
     "Programming Language :: Python :: 3",
     "License :: OSI Approved :: MIT License",
-    "Operating System :: OS Independent"
+    "Operating System :: OS Independent",
 ]
 requires-python = ">=3.9"
 license = { "text" = "MIT" }
...
@@ -4,11 +4,12 @@
 # instead of passing them as command-line arguments.
 #
 # Usage:
-#   $ lm_eval --config configs/default_config.yaml
+#   $ lm_eval --config templates/example_ci_config.yaml
 #
-# You can override any values in this config with command-line arguments:
-#   $ lm_eval --config configs/default_config.yaml --model_args pretrained=gpt2 --tasks mmlu
+# You can override any values in this config with further command-line arguments:
+#   $ lm_eval --config templates/example_ci_config.yaml --model_args pretrained=gpt2 --tasks mmlu
 #
+# For expected types and values, refer to EvaluatorConfig in lm_eval/config/evaluate_config.py
 # All parameters are optional and have the same meaning as their CLI counterparts.
 
 model: hf
@@ -17,9 +18,18 @@ model_args:
   dtype: float16
 
 tasks:
   - hellaswag
-  - gsm8k
+  - arc_easy
 
 batch_size: 1
 trust_remote_code: true
 log_samples: true
 output_path: ./test
-limit: 10
+gen_kwargs:
+  do_sample: true
+  temperature: 0.7
+  stop: ["\n", "<|endoftext|>"]
+samples:
+  hellaswag: [1,2,3,4,5,6,7,8,9,10]
+  arc_easy: [10,20,30,40,50,60,70,80,90,100]
+metadata:
+  name: Example CI Config
+  description: This is an example configuration file for testing purposes.
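A quick way to sanity-check the example config above is to load it and inspect the new gen_kwargs and samples sections; a minimal sketch, assuming the file is saved at templates/example_ci_config.yaml as the usage comment suggests:

import yaml

with open("templates/example_ci_config.yaml") as f:
    cfg = yaml.safe_load(f)

print(cfg["tasks"])                    # ['hellaswag', 'arc_easy']
print(cfg["gen_kwargs"]["stop"])       # ['\n', '<|endoftext|>']
print(cfg["samples"]["arc_easy"][:3])  # [10, 20, 30]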