"torchvision/datapoints/_bounding_box.py" did not exist on "4cb83c2f285101f83b5143663e0d90305e9d7200"
Commit b89af51e authored by Baber

update default values; fixes

parent fadd26e4
......@@ -29,12 +29,11 @@ repos:
- id: mixed-line-ending
args: [--fix=lf]
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.12.2
rev: v0.12.5
hooks:
# Run the linter.
- id: ruff-check
args: [ --fix]
# Run the formatter.
args: [--fix]
- id: ruff-format
- repo: https://github.com/codespell-project/codespell
rev: v2.4.1
......
from rich.traceback import install
from lm_eval._cli.harness import HarnessCLI
from lm_eval.utils import setup_logging
install(show_locals=True)
def cli_evaluate() -> None:
"""Main CLI entry point."""
setup_logging()
......
......@@ -8,6 +8,8 @@ from functools import partial
from lm_eval._cli.subcommand import SubCommand
from lm_eval._cli.utils import (
_int_or_none_list_arg_type,
key_val_to_dict,
merge_dicts,
request_caching_arg_to_dict,
try_parse_json,
)
......@@ -22,17 +24,17 @@ class Run(SubCommand):
"run",
help="Run the evaluation harness on specified tasks",
description="Evaluate language models on various benchmarks and tasks.",
usage="lm-eval run --model <model> --tasks <task1,task2,...> [options]",
usage="lm-eval run --model <model> --tasks <task> <task> --model_args <arg=value> <arg=value> [options]",
epilog=textwrap.dedent("""
examples:
# Basic evaluation with HuggingFace model
$ lm-eval run --model hf --model_args pretrained=gpt2 --tasks hellaswag
$ lm-eval run --model hf --model_args pretrained=gpt2 dtype=float32 --tasks hellaswag
# Evaluate on multiple tasks with few-shot examples
$ lm-eval run --model vllm --model_args pretrained=EleutherAI/gpt-j-6B --tasks arc_easy,arc_challenge --num_fewshot 5
$ lm-eval run --model vllm --model_args pretrained=EleutherAI/gpt-j-6B --tasks arc_easy arc_challenge --num_fewshot 5
# Evaluation with custom generation parameters
$ lm-eval run --model hf --model_args pretrained=gpt2 --tasks lambada --gen_kwargs "temperature=0.8,top_p=0.95"
$ lm-eval run --model hf --model_args pretrained=gpt2 --tasks lambada --gen_kwargs temperature=0.8 top_p=0.95 'stop=["\\n\\n"]'
# Use configuration file
$ lm-eval run --config my_config.yaml --tasks mmlu
......@@ -73,9 +75,10 @@ class Run(SubCommand):
"-t",
default=None,
type=str,
metavar="TASK1,TASK2",
nargs="*",
metavar="TASK1 TASK2",
help=textwrap.dedent("""
Comma-separated list of task names or groupings.
Space- or comma-separated list of task names or groupings.
Use 'lm-eval list tasks' to see all available tasks.
""").strip(),
)
......@@ -83,9 +86,10 @@ class Run(SubCommand):
"--model_args",
"-a",
default=None,
type=try_parse_json,
nargs="*",
type=key_val_to_dict,
metavar="ARGS",
help="Model arguments as 'key=val,key2=val2' or JSON string",
help="Model arguments as 'key=val,key2=val2' or `key=val` `key2=val2`",
)
# Evaluation Settings
......@@ -124,10 +128,14 @@ class Run(SubCommand):
)
eval_group.add_argument(
"--gen_kwargs",
type=try_parse_json,
type=key_val_to_dict,
default=None,
nargs="*",
metavar="KWARGS",
help="Generation arguments as 'key=val,key2=val2' or JSON string",
help=textwrap.dedent(
'Generation arguments as `temperature=0,stop=["stop"]` or `key=val` `key2=val2`. '
"Values should be parsable with ast.literal_eval."
),
)
# Data and Output
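The `key_val_to_dict` type now used for `--model_args` and `--gen_kwargs` is imported from `lm_eval._cli.utils`, but its body is not part of this diff. A minimal sketch of what such a parser could look like, assuming each argv token is a single `key=value` pair whose value is passed through `ast.literal_eval` (the real helper presumably also handles the comma-separated form mentioned in the help text):

```python
import ast
from typing import Any


def key_val_to_dict(token: str) -> dict[str, Any]:
    # Hypothetical sketch, not the actual lm_eval._cli.utils implementation.
    # Split "key=value" on the first '=' and try literal_eval on the value so
    # numbers, booleans, and lists come back typed; otherwise keep the string.
    key, _, raw = token.partition("=")
    try:
        value = ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        value = raw
    return {key.strip(): value}


# With nargs="*", argparse collects one dict per token, e.g.
#   --gen_kwargs temperature=0.8 top_p=0.95 'stop=["\n\n"]'
#   -> [{"temperature": 0.8}, {"top_p": 0.95}, {"stop": ["\n\n"]}]
```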
......@@ -160,9 +168,10 @@ class Run(SubCommand):
"-E",
default=None,
type=try_parse_json,
metavar="JSON_FILE",
metavar='"task1": [1,2,3,4,...]"',
help=textwrap.dedent(
'JSON file with specific sample indices for inputs: {"task_name":[indices],...}. Incompatible with --limit.'
"`...` `...` Sample indices for inputs. Incompatible with --limit."
" Values be parsable with ast.literal_eval."
),
)
......@@ -250,24 +259,24 @@ class Run(SubCommand):
)
logging_group.add_argument(
"--wandb_args",
type=str,
type=key_val_to_dict,
default=argparse.SUPPRESS,
metavar="ARGS",
help="Weights & Biases init arguments (key=val,key2=val2)",
help="Weights & Biases init arguments key=val key2=val2",
)
logging_group.add_argument(
"--wandb_config_args",
type=str,
type=key_val_to_dict,
default=argparse.SUPPRESS,
metavar="ARGS",
help="Weights & Biases config arguments (key=val,key2=val2)",
help="Weights & Biases config arguments key=val key2=val2",
)
logging_group.add_argument(
"--hf_hub_log_args",
type=str,
type=key_val_to_dict,
default=argparse.SUPPRESS,
metavar="ARGS",
help="Hugging Face Hub logging arguments (key=val,key2=val2)",
help="Hugging Face Hub logging arguments key=val key2=val2",
)
# Advanced Options
......@@ -307,15 +316,28 @@ class Run(SubCommand):
"--metadata",
type=json.loads,
default=None,
metavar="JSON",
metavar="`key=val` `key2=val2`",
help=textwrap.dedent(
"""JSON metadata for task configs (merged with model_args), required for some tasks such as RULER"""
"""`key=val` `key2=val` args parsable by ast.literal_eval (merged with model_args),
required for some tasks such as RULER"""
),
)
def _execute(self, args: argparse.Namespace) -> None:
@staticmethod
def _execute(args: argparse.Namespace) -> None:
"""Runs the evaluation harness with the provided arguments."""
os.environ["TOKENIZERS_PARALLELISM"] = "false"
MERGE_ARGS_DICTS = [
"model_args",
"gen_kwargs",
"wandb_args",
"wandb_config_args",
"hf_hub_log_args",
]
for arg_name in MERGE_ARGS_DICTS:
if current_value := getattr(args, arg_name, None):
setattr(args, arg_name, merge_dicts(*current_value))
from lm_eval.config.evaluate_config import EvaluatorConfig
eval_logger = logging.getLogger(__name__)
......
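Because `nargs="*"` yields one dict per `key=val` token, `_execute` now folds them with `merge_dicts` before the namespace reaches `EvaluatorConfig`. That helper also lives in `lm_eval._cli.utils` and is not shown in this diff; a plausible sketch, assuming later tokens win on key collisions:

```python
from typing import Any


def merge_dicts(*dicts: dict[str, Any]) -> dict[str, Any]:
    # Hypothetical sketch of lm_eval._cli.utils.merge_dicts: fold left to
    # right so that a later `key=val` token overrides an earlier one.
    merged: dict[str, Any] = {}
    for d in dicts:
        merged.update(d)
    return merged


# merge_dicts({"temperature": 0.8}, {"top_p": 0.95}, {"temperature": 0.0})
# -> {"temperature": 0.0, "top_p": 0.95}
```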
......@@ -8,7 +8,6 @@ import re
from collections.abc import Callable
from copy import deepcopy
from functools import cached_property
from types import MethodType
from typing import TYPE_CHECKING, Any, Literal, overload
import datasets
......@@ -523,8 +522,8 @@ class Task(abc.ABC):
# self.aggregation = lambda: {
# metric_name: get_metric_aggregation(metric_name)
# }
setattr(self._config, "metric_list", [MetricConfig(name=metric_name)])
setattr(self._config, "process_results", lambda *args: {"bypass": 0})
self._config.metric_list = [MetricConfig(name=metric_name)]
self._config.process_results = lambda *args: {"bypass": 0}
def set_fewshot_seed(self, seed: int | None = None) -> None:
self.fewshot_rnd = random.Random(seed)
......@@ -656,6 +655,18 @@ class ConfigurableTask(Task):
)
self.task_docs = self.eval_docs
# for name, fn in self.config._fn.items():
# if hasattr(self, name):
# setattr(
# self,
# name,
# types.MethodType(
# lambda self, *args, _fn=fn, **kwargs: _fn(*args, **kwargs),
# self,
# ),
# )
self.runtime_checks(self.task_docs[0])
def download(
self, dataset_kwargs: dict[str, Any] | None = None, **kwargs
......@@ -968,6 +979,8 @@ class ConfigurableTask(Task):
# if self.prompt is not None:
# doc_to_text = self.prompt
doc_to_text = doc_to_text or self.config.doc_to_text
if callable(doc_to_text):
return doc_to_text(doc)
if doc_to_text in doc:
return doc[doc_to_text]
elif isinstance(doc_to_text, str):
......@@ -1013,6 +1026,8 @@ class ConfigurableTask(Task):
# if self.prompt is not None:
# doc_to_target = self.prompt
doc_to_target = doc_to_target or self.config.doc_to_target
if callable(doc_to_target):
return doc_to_target(doc)
if doc_to_target in doc:
return doc[doc_to_target]
elif isinstance(doc_to_target, str):
......@@ -1274,6 +1289,8 @@ class ConfigurableTask(Task):
)
def process_results(self, doc: dict, results: list) -> dict[str, Any]:
if callable(self.config.process_results):
return self.config.process_results(doc, results)
result_dict = {}
use_metric = list(m.metric_name for m in self.config._metric_list)
if self.OUTPUT_TYPE == "loglikelihood":
......@@ -1423,6 +1440,7 @@ class ConfigurableTask(Task):
# Test One Doc
self.features: list[str] = list(self.task_docs.features.keys())
self.multiple_target = 0
self.multiple_input = 0
test_text = self.doc_to_text(test_doc)
test_target = self.doc_to_target(test_doc)
......@@ -1430,13 +1448,19 @@ class ConfigurableTask(Task):
test_choice = self.doc_to_choice(test_doc)
if not isinstance(test_choice, list):
eval_logger.error("doc_to_choice must return list")
# else:
# num_choice = len(test_choice)
else:
num_choice = len(test_choice)
if isinstance(test_text, int):
eval_logger.debug(
"doc_to_text returned an int. Assuming multiple inputs."
)
if isinstance(test_text, int):
eval_logger.debug(
"doc_to_text returned an int. Assuming multiple inputs."
)
self.multiple_input = num_choice
else:
test_choice = None
......
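The new `callable(...)` branches in `doc_to_text`, `doc_to_target`, and `process_results` let a Python task config supply functions directly instead of column names or templates. A simplified illustration of the resolution order these hunks add (not the full `ConfigurableTask` logic, which also renders Jinja templates and handles choice indices):

```python
from typing import Any, Callable, Union


def resolve_doc_to_text(doc: dict[str, Any], doc_to_text: Union[Callable, str]) -> Any:
    # Simplified sketch of the lookup order: callables are invoked with the
    # doc, a matching column name indexes into the doc, and any other string
    # would be rendered as a template in the real implementation.
    if callable(doc_to_text):
        return doc_to_text(doc)
    if doc_to_text in doc:
        return doc[doc_to_text]
    if isinstance(doc_to_text, str):
        return doc_to_text  # placeholder for template rendering
    raise TypeError(f"Unsupported doc_to_text: {doc_to_text!r}")


# e.g. with a callable supplied by a task config:
# resolve_doc_to_text({"question": "2+2?"}, lambda d: f"Q: {d['question']}")
# -> "Q: 2+2?"
```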
......@@ -21,6 +21,7 @@ DICT_KEYS = [
"hf_hub_log_args",
"metadata",
"model_args",
"gen_kwargs",
]
......@@ -79,7 +80,7 @@ class EvaluatorConfig:
# Device
device: Optional[str] = field(
default=None, metadata={"help": "Device to use (e.g. cuda, cuda:0, cpu)"}
default="cuda:0", metadata={"help": "Device to use (e.g. cuda, cuda:0, cpu)"}
)
# Data sampling and limiting
......@@ -126,7 +127,10 @@ class EvaluatorConfig:
default=None, metadata={"help": "Custom System instruction to add"}
)
apply_chat_template: Union[bool, str] = field(
default=False, metadata={"help": "Apply chat template to prompt"}
default=False,
metadata={
"help": "Apply chat template to prompt. Either True, or a string identifying the tokenizer template."
},
)
fewshot_as_multiturn: bool = field(
default=False,
......@@ -170,7 +174,7 @@ class EvaluatorConfig:
metadata={"help": "Seeds for random, numpy, torch, fewshot (random)"},
)
# Security and safety
# Security
trust_remote_code: bool = field(
default=False, metadata={"help": "Trust remote code for HF datasets"}
)
......@@ -201,7 +205,7 @@ class EvaluatorConfig:
config.update(cls.load_yaml_config(namespace.config))
# Override with CLI args (only truthy values, exclude non-config args)
excluded_args = {"config", "command", "func"} # argparse internal args
excluded_args = {"command", "func"} # argparse internal args
cli_args = {
k: v for k, v in vars(namespace).items() if v and k not in excluded_args
}
......@@ -252,7 +256,6 @@ class EvaluatorConfig:
try:
yaml_data = yaml.safe_load(config_file.read_text())
print(textwrap.dedent(f"""yaml: {yaml_data}"""))
except yaml.YAMLError as e:
raise ValueError(f"Invalid YAML in {config_path}: {e}")
except (OSError, UnicodeDecodeError) as e:
......@@ -337,17 +340,10 @@ class EvaluatorConfig:
metadata=self.metadata if self.metadata else {},
)
# self.tasks is a comma-separated string of task names
if isinstance((task_list := self.tasks), str):
task_list = self.tasks.split(",")
else:
assert isinstance(self.tasks, list), (
"`tasks` must be a comma delimited string of task names or list[str]."
)
task_names = task_manager.match_tasks(task_list)
task_names = task_manager.match_tasks(self.tasks)
# Check for any individual task files in the list
for task in [task for task in task_list if task not in task_names]:
for task in [task for task in self.tasks if task not in task_names]:
task_path = Path(task)
if task_path.is_file():
config = utils.load_yaml_config(str(task_path))
......@@ -355,7 +351,7 @@ class EvaluatorConfig:
# Check for missing tasks
task_missing = [
task for task in task_list if task not in task_names and "*" not in task
task for task in self.tasks if task not in task_names and "*" not in task
]
if task_missing:
......
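For context on the `from_namespace` hunk above: YAML values are loaded first and any truthy CLI value then overrides them, with `command` and `func` excluded as argparse bookkeeping. A hedged sketch of that precedence, simplified from the real method:

```python
import argparse
from typing import Any


def merged_config(namespace: argparse.Namespace, yaml_config: dict[str, Any]) -> dict[str, Any]:
    # Sketch of the precedence in EvaluatorConfig.from_namespace (simplified):
    # start from the YAML file, then let truthy CLI values win.
    excluded = {"command", "func"}  # argparse internals, not config fields
    config = dict(yaml_config)
    config.update(
        {k: v for k, v in vars(namespace).items() if v and k not in excluded}
    )
    return config
```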
......@@ -38,7 +38,7 @@ class MetricConfig:
return is_higher_better(self.name)
return self.higher_is_better
def compute_metric(self, *args, **kwargs) -> Any:
def compute(self, *args, **kwargs) -> Any:
"""Calculates the metric using the provided function and arguments."""
if self.fn is None:
raise ValueError(f"Metric function for {self.name} is not defined.")
......
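Call sites simply switch from `compute_metric(...)` to `compute(...)`. A hedged usage sketch; the `fn` field exists per the check above, but the constructor and call signatures are assumed for illustration:

```python
from lm_eval.config.metric import MetricConfig

# Hypothetical usage of the renamed method (argument shapes are assumptions).
exact_match = MetricConfig(name="exact_match", fn=lambda pred, gold: float(pred == gold))
score = exact_match.compute("Paris", "Paris")  # was: exact_match.compute_metric(...)
```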
......@@ -10,7 +10,7 @@ import datasets
from lm_eval.api.filter import FilterEnsemble
from lm_eval.api.instance import OutputType
from lm_eval.config.metric import MetricConfig
from lm_eval.config.utils import doc_to_closure, maybe_serialize
from lm_eval.config.utils import maybe_serialize
if TYPE_CHECKING:
......@@ -364,7 +364,7 @@ class TaskConfig:
@classmethod
def from_yaml(cls, data: dict[str, Any]) -> TaskConfig:
"""Create a TaskConfig instance from a YAML-like dictionary."""
fn = {k: doc_to_closure(v) for k, v in data.items() if callable(v)}
fn = {k: v for k, v in data.items() if callable(v)}
return cls(**data, _fn=fn)
@classmethod
......
......@@ -475,7 +475,9 @@ def evaluate(
"Either 'limit' or 'samples' must be None, but both are not None."
)
if samples is not None:
eval_logger.info(f"Evaluating examples for tasks {list(samples.keys())}")
eval_logger.info(
f"Evaluating examples for tasks {[x for x in list(samples.keys()) if x in task_dict.keys()]}"
)
if apply_chat_template:
eval_logger.warning(
"Chat template formatting change affects loglikelihood and multiple-choice tasks. See docs/chat-template-readme.md for details."
......
......@@ -11,10 +11,10 @@ authors = [
description = "A framework for evaluating language models"
readme = "README.md"
classifiers = [
"Development Status :: 3 - Alpha",
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent"
"Development Status :: 3 - Alpha",
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
]
requires-python = ">=3.9"
license = { "text" = "MIT" }
......
......@@ -4,11 +4,12 @@
# instead of passing them as command-line arguments.
#
# Usage:
# $ lm_eval --config configs/default_config.yaml
# $ lm_eval --config templates/example_ci_config.yaml
#
# You can override any values in this config with command-line arguments:
# $ lm_eval --config configs/default_config.yaml --model_args pretrained=gpt2 --tasks mmlu
# You can override any values in this config with further command-line arguments:
# $ lm_eval --config templates/example_ci_config.yaml --model_args pretrained=gpt2 --tasks mmlu
#
# For expected types and values, refer to EvaluatorConfig in lm_eval/config/evaluate_config.py
# All parameters are optional and have the same meaning as their CLI counterparts.
model: hf
......@@ -17,9 +18,18 @@ model_args:
dtype: float16
tasks:
- hellaswag
- gsm8k
- arc_easy
batch_size: 1
trust_remote_code: true
log_samples: true
output_path: ./test
limit: 10
gen_kwargs:
do_sample: true
temperature: 0.7
stop: ["\n", "<|endoftext|>"]
samples:
hellaswag: [1,2,3,4,5,6,7,8,9,10]
arc_easy: [10,20,30,40,50,60,70,80,90,100]
metadata:
name: Example CI Config
description: This is an example configuration file for testing purposes.